In [1]:
import numpy as np
import pandas as pd
import ssl
import copy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import random
from pprint import pprint

from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN, SMOTENC, SVMSMOTE
from imblearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler


from sklearn.metrics import hamming_loss, roc_auc_score
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 

from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import wordcloud

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


import scikitplot as skplt
import matplotlib.pyplot as plt

from keras import optimizers
from keras.losses import binary_crossentropy
from keras.metrics import binary_accuracy
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

from scipy.special import softmax

# NOTE(security): this swaps the default HTTPS context factory for the unverified
# one, so HTTPS requests that rely on the stdlib default context in this kernel
# skip certificate verification. Presumably a workaround for failing downloads
# (e.g. NLTK corpora) -- TODO confirm it is still needed and remove it otherwise.
ssl._create_default_https_context = ssl._create_unverified_context

Using TensorFlow backend.


In [2]:
def clean_statement(statement):
    """Reduce raw problem-statement text to space-separated, letter-only words.

    Processing order:
      1. Replace the contraction suffix ``n't`` with a space. This has to run
         before apostrophes are stripped, otherwise it can never match.
      2. Replace every run of non-letter characters (digits, punctuation,
         ``$``, newlines, tabs, ...) with a single space.
      3. Delete the boilerplate tokens ``submission(s)``/``Submission(s)``,
         ``th`` (what is left of ordinals such as ``5th``), ``one``, ``two``,
         ``given`` and ``need`` -- as whole words only, so that words merely
         containing them (``done``, ``with``, ``needed``) are left intact.
      4. Collapse repeated spaces and trim the ends.

    Parameters
    ----------
    statement : str
        Raw statement text.

    Returns
    -------
    str
        Cleaned text; letter case is preserved.
    """
    # Must precede the non-letter strip: afterwards "don't" is already "don t".
    statement = re.sub(r"n't", ' ', statement)
    statement = re.sub(r'[^A-Za-z]+', ' ', statement)
    # \b anchors keep the removal at word level instead of mangling substrings.
    statement = re.sub(r'\b(?:[Ss]ubmissions?|th|one|two|given|need)\b', '', statement)
    return re.sub(r' +', ' ', statement).strip()

In [3]:
def process_problem_statement(q_statement):
    """Clean a problem statement and drop English stopwords.

    The text is first passed through ``clean_statement``, then lower-cased,
    split on whitespace and filtered against NLTK's English stopword list.

    Parameters
    ----------
    q_statement : str
        Raw problem-statement text.

    Returns
    -------
    str
        The surviving lower-case words joined by single spaces.
    """
    cleaned = clean_statement(q_statement)
    stoplist = set(stopwords.words('english'))
    # The previous version also ran word_tokenize and built a PorterStemmer,
    # but neither result was ever used; both have been removed.
    kept_words = [word for word in cleaned.lower().split() if word not in stoplist]
    return ' '.join(kept_words)

In [4]:
def process_problem_solution(solution):
    """Lower-case a solution's text and drop English stopwords.

    The text is split on whitespace only; tokens that appear in NLTK's English
    stopword list are discarded and the remainder is re-joined with single
    spaces.

    Parameters
    ----------
    solution : str
        Solution text (source code as a string).

    Returns
    -------
    str
        Lower-cased text without stopword tokens.
    """
    stoplist = set(stopwords.words('english'))
    # The previous version also called word_tokenize here and discarded the
    # result; that unused pass has been removed.
    kept_words = [word for word in solution.lower().split() if word not in stoplist]
    return ' '.join(kept_words)

In [5]:
def process_time_taken(time_col):
    """Return the first whitespace-delimited token of ``time_col``.

    For an input such as ``"2 seconds"`` the result is ``"2"``.

    Raises
    ------
    IndexError
        If ``time_col`` is empty or contains only whitespace.
    """
    fields = time_col.split()
    return fields[0]

In [6]:
def process_tags(all_tags_list, tag_col):
    """Split a comma-separated tag string into a list of tags.

    Parameters
    ----------
    all_tags_list
        Accepted for call-site compatibility; not used by this function.
    tag_col : str
        Comma-separated tags for one row.

    Returns
    -------
    list of str
        The pieces between commas, exactly as they appear (surrounding
        whitespace is not stripped; an empty input yields ``['']``).
    """
    # re.split already returns a list, so no extra conversion is required.
    return re.split(',', tag_col)

In [7]:
def get_all_distinct_tags(tags_col):
    """Collect every distinct tag that occurs in a column of tag strings.

    Parameters
    ----------
    tags_col : iterable of str
        One comma-separated tag string per row.

    Returns
    -------
    list of str
        The distinct tags, sorted so that the result is reproducible across
        runs (a list built directly from a set has arbitrary order).
    """
    distinct = set()
    for row in tags_col:
        distinct.update(re.split(',', row))
    # The previous version also built a stopword-filtered copy of the list and
    # then discarded it; that dead computation has been removed.
    return sorted(distinct)

In [8]:
# Hard-coded snapshot of the tag names obtained from the dataset.
tags_list = [
    'dsu',
    'trees',
    'chinese remainder theorem',
    'sortings',
    'games',
    'implementation',
    'bitmasks',
    '*special',
    'hashing',
    'geometry',
    'two pointers',
    'combinatorics',
    'flows',
    'strings',
    'probabilities',
    'data structures',
    'ternary search',
    'greedy',
    'math',
    'matrices',
    'divide and conquer',
    'dfs and similar',
    'constructive algorithms',
    'brute force',
    'dp',
    '2-sat',
    'graph matchings',
    'binary search',
    'number theory',
    'graphs',
    'fft',
    'shortest paths',
    'schedules',
    'meet-in-the-middle',
    'string suffix structures',
    'expression parsing',
]

In [9]:
def plot_class_distribution(Y, classes):
    """Draw a horizontal bar chart of the per-class share of rows.

    Parameters
    ----------
    Y : array-like indexable as ``Y[:, j]`` with a ``shape`` attribute
        Label matrix; column ``j`` belongs to ``classes[j]``.
    classes : sequence
        Bar labels, one per column of ``Y``.

    For each column the bar length is the column sum divided by the number of
    rows. The figure is shown immediately; nothing is returned.
    """
    # Column sum over row count, computed one column at a time.
    fractions = [np.sum(Y[:, col]) / Y.shape[0] for col in range(Y.shape[1])]

    plt.figure(figsize=(10, 10), dpi=100)
    plt.barh(classes, fractions, align='center', alpha=0.5)
    plt.show()

In [19]:
def data_preprocessing(csv_path="./codeforces_question_v5.csv"):
    """Load the Codeforces CSV and build the model inputs and multi-hot labels.

    Steps:
      * read ``csv_path`` and drop the ``id``, ``name`` and ``author`` columns;
      * discard rows whose solution is ``"no code found"`` and rows with
        missing values;
      * store the distinct tags in the module-level ``distinct_tags``;
      * normalise the ``problem statement``, ``solution`` and ``time_taken``
        columns with the corresponding ``process_*`` helpers;
      * build ``X`` from the solution text and the time token;
      * binarise the per-row tag lists with ``MultiLabelBinarizer``.

    Parameters
    ----------
    csv_path : str, optional
        Location of the dataset; defaults to the path that was previously
        hard-coded.

    Returns
    -------
    X : pandas.Series of str
        ``"<processed solution> <time token>"`` for every kept row.
    Y : array
        Multi-hot label matrix; columns follow ``mlb.classes_``.

    Side effects: sets the global ``distinct_tags``, prints the tag classes
    and shows the class-distribution plot.
    """
    global distinct_tags

    df = pd.read_csv(csv_path)
    df = df.drop(columns=['id', 'name', 'author'])
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the filtered original.
    df = df[df.solution != "no code found"].dropna().copy()

    distinct_tags = get_all_distinct_tags(df["tags"])

    # NOTE(review): the processed problem statement is not part of X below --
    # confirm whether it was meant to be included in the features.
    df["problem statement"] = df["problem statement"].map(process_problem_statement)
    df["solution"] = df["solution"].map(process_problem_solution)
    df["time_taken"] = df["time_taken"].map(process_time_taken)

    # Join with an explicit space: plain concatenation fused the last token of
    # the solution with the time value into one spurious token. The sum is
    # already a new Series, so no deepcopy is needed.
    X = df["solution"] + " " + df["time_taken"]
    tag_lists = [process_tags(distinct_tags, x) for x in df["tags"]]

    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(tag_lists)
    print("Tags: ")
    print(mlb.classes_)
    print()
    plot_class_distribution(Y, mlb.classes_)
    return X, Y

In [None]:
# Build the features X and the multi-hot label matrix Y. data_preprocessing()
# also sets the module-level `distinct_tags`; a `global` statement is a no-op at
# module scope, so the one that used to sit here has been dropped.
X, Y = data_preprocessing()
# Lift pandas' row/column display limits so pandas objects print untruncated.
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
# Show the feature Series returned by data_preprocessing().
print(X)

In [None]:
# Show the label matrix produced by MultiLabelBinarizer in data_preprocessing().
print(Y)