In [12]:
import numpy as np
import pandas as pd
import ssl
import copy
import matplotlib.pyplot as plt
import seaborn as sns #for data visualization
%matplotlib inline
import re
import random
from pprint import pprint

from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 

from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import wordcloud  #for data visualization

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

import scikitplot as skplt #for data visualization
import matplotlib.pyplot as plt



In [13]:
def clean_statement(statement):
    """Normalise free text into space-separated alphabetic words.

    Processing order:
      1. Replace the contraction suffix ``n't`` with a space. This has to
         happen *before* punctuation is stripped, otherwise the apostrophe
         is already gone and the pattern can never match.
      2. Replace every run of non-letter characters (digits, punctuation,
         ``$``, newlines, tabs, ...) with a single space.
      3. Remove a small set of noise words (``submission(s)``, ``th``,
         ``one``, ``two``, ``given``, ``need``) as whole words only, so
         words that merely contain them ("with", "done", "money",
         "needed") are left intact.
      4. Collapse repeated whitespace and trim both ends.

    Parameters
    ----------
    statement : object
        Text to clean. Anything that is not a ``str`` (for example a NaN
        cell) is returned unchanged.

    Returns
    -------
    object
        The cleaned string, or ``statement`` itself when it is not a string.
    """
    if not isinstance(statement, str):
        return statement

    # Contractions first, while the apostrophe still exists.
    statement = re.sub(r"n't\b", ' ', statement)

    # Everything that is not an ASCII letter becomes a separator. This also
    # covers literal '$' signs, commas, dots, question marks, '\n' and '\t'.
    statement = re.sub(r'[^A-Za-z]+', ' ', statement)

    # Whole-word removal only; \b prevents eating fragments of other words.
    statement = re.sub(
        r'\b(?:[Ss]ubmissions?|th|one|two|given|need)\b', '', statement
    )

    # Removing words leaves gaps; normalise them to single spaces.
    return ' '.join(statement.split())

In [14]:
def process_problem_statement(q_statement):
    """Clean a problem statement and drop English stop words.

    The text is passed through ``clean_statement``, lower-cased, split on
    whitespace, and every token found in NLTK's English stop-word list is
    discarded. The remaining tokens are joined with single spaces.

    Parameters
    ----------
    q_statement : object
        Raw problem statement. Non-string values (for example a NaN cell)
        yield an empty string instead of raising.

    Returns
    -------
    str
        The filtered, lower-cased statement.
    """
    # Guard first: missing cells would otherwise fail on ``.lower()``.
    if not isinstance(q_statement, str):
        return ''

    cleaned = clean_statement(q_statement)
    stoplist = set(stopwords.words('english'))

    # NOTE(review): the previous version also ran word_tokenize and built a
    # PorterStemmer but never used either result, so both were removed.
    return ' '.join(
        word for word in cleaned.lower().split() if word not in stoplist
    )

In [15]:
def process_problem_solution(solution):
    """Lower-case a solution text and drop English stop words.

    Unlike ``process_problem_statement`` the text is *not* passed through
    ``clean_statement``; it is only lower-cased, split on whitespace and
    filtered against NLTK's English stop-word list.

    Parameters
    ----------
    solution : object
        Raw solution text. Non-string values (for example a NaN cell)
        yield an empty string instead of raising.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    # Guard first: missing cells would otherwise fail on ``.lower()``.
    if not isinstance(solution, str):
        return ''

    stoplist = set(stopwords.words('english'))

    # NOTE(review): an unused word_tokenize call was removed here; its
    # result was never read.
    return ' '.join(
        word for word in solution.lower().split() if word not in stoplist
    )

In [16]:
def process_tags(tag_col, allowed_tags=None):
    """Split a comma-separated tag string and keep only recognised tags.

    Parameters
    ----------
    tag_col : object
        Comma-separated tags, e.g. ``"dp, greedy"``. Non-string values
        (for example a NaN cell) yield an empty list.
    allowed_tags : iterable of str, optional
        Tags to keep. Defaults to the notebook-level ``tags_list``.

    Returns
    -------
    list of str
        The recognised tags in their original order, whitespace-trimmed
        and without duplicates.
    """
    # NOTE(review): the previous body referenced `solution`, `tags_present`
    # and `all_tags_list`, none of which were defined, so every call raised
    # NameError. `tags_list` is assumed to be the intended whitelist.
    if not isinstance(tag_col, str):
        return []
    if allowed_tags is None:
        allowed_tags = tags_list

    allowed = set(allowed_tags)
    kept = []
    for raw_tag in tag_col.split(','):
        tag = raw_tag.strip()
        if tag in allowed and tag not in kept:
            kept.append(tag)
    return kept
    

In [17]:
def get_all_distinct_tags(tags_col):
    """Collect every distinct tag that appears in a column of tag strings.

    Parameters
    ----------
    tags_col : iterable
        Each item is expected to be a comma-separated string of tags.
        Non-string items (for example NaN cells) are skipped.

    Returns
    -------
    list of str
        The distinct, whitespace-trimmed, non-empty tags in sorted order,
        so the result is reproducible between runs.
    """
    # NOTE(review): the previous version built a stop-word-filtered list
    # but returned the unfiltered one; the unused filter and a leftover
    # debug print were removed. No stop-word filtering is applied.
    distinct = set()
    for row in tags_col:
        if not isinstance(row, str):
            continue
        distinct.update(tag.strip() for tag in row.split(','))

    # "a,,b" or an empty cell would otherwise contribute an empty tag.
    distinct.discard('')
    return sorted(distinct)

In [18]:
def process_problem_Languages(lang_col):
    """Clean one ``Languages`` value by delegating to ``clean_statement``.

    Returns whatever ``clean_statement`` returns for ``lang_col``.
    """
    return clean_statement(lang_col)

In [19]:
# Whitelist of recognised tag names; validate_tags discards any tag that is
# not in this list.
tags_list = [
    'dsu', 'trees', 'chinese remainder theorem', 'sortings',
    'games', 'implementation', 'bitmasks', '*special',
    'hashing', 'geometry', 'two pointers', 'combinatorics',
    'flows', 'strings', 'probabilities', 'data structures',
    'ternary search', 'greedy', 'math', 'matrices',
    'divide and conquer', 'dfs and similar', 'constructive algorithms', 'brute force',
    'dp', '2-sat', 'graph matchings', 'binary search',
    'number theory', 'graphs', 'fft', 'shortest paths',
    'schedules', 'meet-in-the-middle', 'string suffix structures', 'expression parsing',
]

In [23]:
def validate_tags(tags, allowed_tags=None):
    """Parse a stringified tag list and keep only the recognised tags.

    Parameters
    ----------
    tags : object
        Usually the text of a Python list literal, e.g. ``"['dp', 'x']"``.
        An actual list/tuple/set is accepted as-is. Any other non-string
        value (for example a NaN cell) yields an empty list.
    allowed_tags : iterable of str, optional
        Tags to keep. Defaults to the notebook-level ``tags_list``.

    Returns
    -------
    list
        The recognised tags, in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If ``tags`` is a string that is not a valid Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    if isinstance(tags, str):
        # SECURITY: this text comes from a data file. ``eval`` would run
        # arbitrary code embedded in a cell; ``literal_eval`` only accepts
        # plain literals and raises on anything else.
        parsed = ast.literal_eval(tags)
    else:
        parsed = tags

    if isinstance(parsed, str):
        # A bare string literal is treated as a single tag.
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple, set)):
        return []

    if allowed_tags is None:
        allowed_tags = tags_list
    return [tag for tag in parsed if tag in allowed_tags]

In [24]:
def data_preprocessing(csv_path="./codechef_questions.csv"):
    """Load the questions CSV, clean it, and keep rows with valid tags.

    Steps:
      1. Read ``csv_path`` using ISO-8859-1 encoding.
      2. Drop the ``QCode``, ``Title`` and ``link`` columns.
      3. Clean every ``Languages`` value with ``process_problem_Languages``.
      4. Run ``validate_tags`` on every ``Tags`` value, drop rows that end
         up with no tags, and store the survivors back as ``str(list)``.
      5. Print the resulting ``Tags`` column in full.

    Parameters
    ----------
    csv_path : str, optional
        Location of the CSV file. Defaults to the path that was previously
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame. The previous version returned nothing, so
        the result could not be used after the call.
    """
    df = pd.read_csv(csv_path, encoding="ISO-8859-1")
    df = df.drop(columns=['QCode', 'Title', 'link'])
    df["Languages"] = df["Languages"].map(process_problem_Languages)

    # Validate once per row, then filter with a mask rather than dropping
    # rows in place while iterating over the frame.
    valid_tags = df["Tags"].map(validate_tags)
    has_tags = valid_tags.map(bool).astype(bool)
    df = df.loc[has_tags].copy()
    df["Tags"] = valid_tags.loc[has_tags].map(str)

    # Scope the "show everything" display options to this print only, so
    # they do not leak into every later cell of the notebook.
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print(df["Tags"])

    return df

In [25]:
# Run the preprocessing step; it prints the "Tags" column of the rows that
# kept at least one recognised tag.
data_preprocessing()

2                                    ['trees']
6                                       ['dp']
17                                      ['dp']
22                ['greedy', 'implementation']
24                                  ['greedy']
28                           ['combinatorics']
36                                      ['dp']
49                          ['implementation']
52                                  ['greedy']
53                                ['geometry']
54                                     ['fft']
66                                      ['dp']
67                                      ['dp']
71                                  ['greedy']
73                                    ['math']
86                                ['geometry']
87                                 ['strings']
90                     ['combinatorics', 'dp']
99                            ['dp', 'greedy']
103                                   ['math']
106                                     ['dp']
108          