In [1]:
import os
import collections
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve
from sklearn.metrics import hamming_loss, roc_auc_score
from sklearn.metrics import confusion_matrix 

In [3]:
!pip install imblearn
from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN, SMOTENC, SVMSMOTE
from imblearn.pipeline import make_pipeline



Using TensorFlow backend.


In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# data=pd.read_csv("/content/drive/My Drive/data/questions.csv",usecols=[3,5,6]) 
data=pd.read_csv("codechef_questions_v6.csv",usecols=[3,5,6]) 
# data=data[:5000]

if len(str(data['Problem Statement']).split(".",1))>1:
  data['Problem Statement']=data['Problem Statement'].str.split(".",1).str[1]

In [6]:
data['prob_sol'] = data['Problem Statement'] +' '+ data['Solution']
data.head()

Unnamed: 0,Problem Statement,Solution,tags,prob_sol
0,Read problems statements in Mandarin Chinese...,\n #include <cstring>\n #include <string...,['game'],Read problems statements in Mandarin Chinese...
1,Read problems statements in Mandarin Chinese...,\n #include <cstring>\n #include <string...,['game'],Read problems statements in Mandarin Chinese...
2,Read problems statements in Mandarin Chinese...,\n #include <iostream>\n #include <vecto...,['game'],Read problems statements in Mandarin Chinese...
3,Read problems statements in Mandarin Chinese...,\n #include <bits/stdc++.h> \n using na...,['game'],Read problems statements in Mandarin Chinese...
4,Read problems statements in Mandarin Chinese...,\n /***************************************...,['game'],Read problems statements in Mandarin Chinese...


In [7]:
data.head()
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')
data['prob_sol'].apply(lambda x: [item for item in x if item not in stop])
print("done")

[nltk_data] Downloading package stopwords to /home/sukku/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
done


In [8]:
def clean_statement(statement):
#     x = re.sub('-', ' ', x)
    statement = re.sub('$', ' ', statement)
    statement = re.sub('[^A-Za-z]+', ' ', statement)
    statement = re.sub('[,|.|?|\n]|\t', '', statement)
    statement = re.sub('n\'t', ' ', statement)
    statement = re.sub('submission|submissions|Submission|submission|th ', '', statement)
    statement = re.sub('one|two|given|need', '', statement)
    
    return statement

In [9]:
def clean_tags(tags):
    
    tags = clean_statement(tags).strip()
    tags = list(re.split(' ',tags))
    return tags

In [10]:
def get_unique_tags(tags):
    
    tags_list = []
    tags_set = set(tags_list)
    
    for row in tags:
        tags_set = tags_set.union(set(row))
        
    tags_list = list(tags_set)
    
    return tags_list

In [11]:
# import copy

# X = copy.deepcopy(data["prob_sol"])
X = data["prob_sol"]
Y = [clean_tags(x) for x in data["tags"]]
distinct_tags = get_unique_tags(Y)

print(distinct_tags)


mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y)

['dijkstra', 'heap', 'regex', 'gcd', 'pattern', 'simulation', 'suffix', 'pointers', 'kruskal', 'game', 'constructive', 'map', 'combinatorics', 'geometry', 'trees', 'series', 'set', 'dp', 'segment', 'memoization', 'theory', 'interactive', 'fenwick', 'bitwise', 'enumeration', 'matching', 'greedy', 'graphs', 'knapsack', 'number', 'multiset', 'advanced', 'recurrence', 'disjoint', 'hashing', 'hard', 'array', 'bfs', 'recursion', 'basic', 'digraph', 'permutation', 'backtracking', 'sieve', 'fibonacci', 'sets', 'bitmasking', 'combinatorial', 'divide', 'algebra', 'algorithm', 'expo', 'dfs', 'implementation', 'inversions', 'tree', 'fft', 'prime', 'euler', 'tries', 'bipartite', 'probability', 'counting', 'matrix', 'strings', 'maxflow', 'bruteforce', 'adhoc', 'graph', 'parsing', 'binarysearch', 'binary', 'search', 'sorting', 'stack', 'deque', 'dynamic', 'maths']


In [12]:
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.2, random_state = 0)

## SVM Classifier

In [13]:
classifier = make_pipeline(
    CountVectorizer(ngram_range = (1,3),binary = True,lowercase=False),
    TfidfTransformer(norm = 'l2',sublinear_tf = True),
    OneVsRestClassifier(LinearSVC(penalty="l2",loss="squared_hinge",tol=1,random_state=0,max_iter=1000,C = 1)))

classifier.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=True,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=False, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, voc...
                ('tfidftransformer',
                 TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=True,
                                  use_idf=True)),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None,
                                               

In [14]:
def get_scores( _X , _Y ):
    predicted = classifier.predict(_X)
    _Y[int(_Y.shape[0]/2),:] =  1
    y_labels_predicted = mlb.inverse_transform(predicted)
    y_labels_actual = mlb.inverse_transform(_Y)
    
    
    print("hamming_loss: ",hamming_loss(_Y,predicted))
    print("recall_score: ",recall_score(_Y,predicted,average = 'weighted'))
    print("precision_score: ",precision_score(_Y,predicted,average = 'weighted'))
    print("f1_score: ",f1_score(_Y,predicted,average = 'weighted'))
    print("roc_auc_score: ",roc_auc_score(_Y,predicted,average = 'weighted'))
    print()

In [15]:
#For Train Data
print("Scores for Training data")
get_scores( X_train, Y_train )
print("Scores for Validation data")
get_scores( X_validation, Y_validation )

Scores for Training data
hamming_loss:  3.39503815174123e-05
recall_score:  0.998042647262009
precision_score:  0.9999311615039154
f1_score:  0.9989648595977911
roc_auc_score:  0.9990207283427975

Scores for Validation data
hamming_loss:  0.00013579743960100244
recall_score:  0.9921418128654971
precision_score:  0.9996356451023393


  _warn_prf(average, modifier, msg_start, len(result))


f1_score:  0.9956971480394666
roc_auc_score:  0.9960691235748799

