In [1]:
import os
import collections
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve
from sklearn.metrics import hamming_loss, roc_auc_score
from sklearn.metrics import confusion_matrix 

In [3]:
!pip install imblearn
from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN, SMOTENC, SVMSMOTE
from imblearn.pipeline import make_pipeline



Using TensorFlow backend.


In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# data=pd.read_csv("/content/drive/My Drive/data/questions.csv",usecols=[3,5,6]) 
data=pd.read_csv("codechef_questions_v6.csv",usecols=[3,5,6]) 
data=data[:5000]

if len(str(data['Problem Statement']).split(".",1))>1:
  data['Problem Statement']=data['Problem Statement'].str.split(".",1).str[1]

In [6]:
data['prob_sol'] = data['Problem Statement'] +' '+ data['Solution']
data.head()

Unnamed: 0,Problem Statement,Solution,tags,prob_sol
0,Read problems statements in Mandarin Chinese...,\n #include <cstring>\n #include <string...,['game'],Read problems statements in Mandarin Chinese...
1,Read problems statements in Mandarin Chinese...,\n #include <cstring>\n #include <string...,['game'],Read problems statements in Mandarin Chinese...
2,Read problems statements in Mandarin Chinese...,\n #include <iostream>\n #include <vecto...,['game'],Read problems statements in Mandarin Chinese...
3,Read problems statements in Mandarin Chinese...,\n #include <bits/stdc++.h> \n using na...,['game'],Read problems statements in Mandarin Chinese...
4,Read problems statements in Mandarin Chinese...,\n /***************************************...,['game'],Read problems statements in Mandarin Chinese...


In [7]:
data.head()
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')
data['prob_sol'].apply(lambda x: [item for item in x if item not in stop])
print("done")

[nltk_data] Downloading package stopwords to /home/sukku/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
done


In [8]:
def clean_statement(statement):
#     x = re.sub('-', ' ', x)
    statement = re.sub('$', ' ', statement)
    statement = re.sub('[^A-Za-z]+', ' ', statement)
    statement = re.sub('[,|.|?|\n]|\t', '', statement)
    statement = re.sub('n\'t', ' ', statement)
    statement = re.sub('submission|submissions|Submission|submission|th ', '', statement)
    statement = re.sub('one|two|given|need', '', statement)
    
    return statement

In [9]:
def clean_tags(tags):
    tags = clean_statement(tags).strip()
    tags = list(re.split(' ',tags))
    return tags

In [10]:
def get_unique_tags(tags):
    
    tags_list = []
    
    tags_set = set(tags_list)
    
    for row in tags:
        tags_set = tags_set.union(set(row))
        
    tags_list = list(tags_set)
    
    return tags_list

In [11]:
X = data["prob_sol"]
Y = [clean_tags(x) for x in data["tags"]]
distinct_tags = get_unique_tags(Y)

print(distinct_tags)


mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y)

['constructive', 'graphs', 'hard', 'matrix', 'implementation', 'bitwise', 'tree', 'algebra', 'prime', 'graph', 'game', 'greedy', 'expo', 'combinatorics', 'set', 'gcd', 'disjoint', 'dp', 'series', 'map', 'maths', 'matching']


In [12]:
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.2, random_state = 0)

## Logistic Regression - One Vs Rest

In [13]:
classifier = make_pipeline(
    CountVectorizer(ngram_range = (1,3),binary = True,lowercase=False),
    TfidfTransformer(norm = 'l2',sublinear_tf = True),
    OneVsRestClassifier(LogisticRegression(penalty='l2',solver='lbfgs', max_iter=100)))

for tag in distinct_tags:
    print('Processing {} problems...'.format(tag))
    
    classifier.fit(X_train, Y_train)
    
    prediction = classifier.predict(X_validation)
    print('Test accuracy is {}'.format(accuracy_score(Y_validation, prediction)))
    print("\n")

Processing constructive problems...
Test accuracy is 0.987


Processing graphs problems...
Test accuracy is 0.987


Processing hard problems...
Test accuracy is 0.987


Processing matrix problems...
Test accuracy is 0.987


Processing implementation problems...
Test accuracy is 0.987


Processing bitwise problems...
Test accuracy is 0.987


Processing tree problems...
Test accuracy is 0.987


Processing algebra problems...
Test accuracy is 0.987


Processing prime problems...
Test accuracy is 0.987


Processing graph problems...
Test accuracy is 0.987


Processing game problems...
Test accuracy is 0.987


Processing greedy problems...
Test accuracy is 0.987


Processing expo problems...
Test accuracy is 0.987


Processing combinatorics problems...
Test accuracy is 0.987


Processing set problems...
Test accuracy is 0.987


Processing gcd problems...
Test accuracy is 0.987


Processing disjoint problems...
Test accuracy is 0.987


Processing dp problems...
Test accuracy is 0.987


Proce

In [14]:
def get_scores( _X , _Y ):
    predicted = classifier.predict(_X)
    _Y[int(_Y.shape[0]/2),:] =  1
    y_labels_predicted = mlb.inverse_transform(predicted)
    y_labels_actual = mlb.inverse_transform(_Y)
    
    
    print("hamming_loss: ",hamming_loss(_Y,predicted))
    print("recall_score: ",recall_score(_Y,predicted,average = 'weighted'))
    print("precision_score: ",precision_score(_Y,predicted,average = 'weighted'))
    print("f1_score: ",f1_score(_Y,predicted,average = 'weighted'))
    print("roc_auc_score: ",roc_auc_score(_Y,predicted,average = 'weighted'))
    print()

In [15]:
#For Train Data
print("Scores for Training data")
get_scores( X_train, Y_train )
print("Scores for Validation data")
get_scores( X_validation, Y_validation )

Scores for Training data
hamming_loss:  0.0015113636363636364
recall_score:  0.9722454090150251
precision_score:  0.9922787979966611
f1_score:  0.9818492006574159
roc_auc_score:  0.9861227045075125

Scores for Validation data


  _warn_prf(average, modifier, msg_start, len(result))


hamming_loss:  0.0016363636363636363
recall_score:  0.9702970297029703
precision_score:  0.990924092409241
f1_score:  0.9802391750543763
roc_auc_score:  0.9851485148514851



  _warn_prf(average, modifier, msg_start, len(result))
