In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re
import os
from sqlalchemy import create_engine # database connection
import datetime as dt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import mlknn
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from datetime import datetime

In [None]:
# Load the training CSV from the working directory and preview the first rows.
df = pd.read_csv("Train.csv")
df.head()

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# Drop rows that repeat an earlier (Title, Body, Tags) combination.
df = df.drop_duplicates(['Title', 'Body', 'Tags'])

In [None]:
# Number of tags attached to each question. Tags is a whitespace-separated
# string. A missing value counts as zero tags (instead of being stringified to
# "nan" and counted as one), and splitting on any whitespace avoids counting
# the empty strings that repeated spaces would otherwise produce.
df["tag_count"] = df["Tags"].apply(
    lambda tag_string: 0 if pd.isna(tag_string) else len(str(tag_string).split())
)

In [None]:
# Preview the frame again, now including the tag_count column.
df.head()

In [None]:
# How many questions carry each number of tags.
df.tag_count.value_counts()

In [None]:
# Remove, in place, every row that has a missing value in any column.
# NOTE(review): df was rebound to the result of drop_duplicates above; confirm
# that mutating it in place is intended rather than assigning a new frame.
df.dropna(inplace=True)

In [None]:
# Bag-of-words over the Tags column: each tag string is split on single spaces,
# so every tag becomes one vocabulary entry.
vectorizer = CountVectorizer(tokenizer= lambda text : text.split(" "))
# Sparse document-term matrix: one row per question, one column per tag.
tag_dtm = vectorizer.fit_transform(df["Tags"])

In [None]:
# Vocabulary (tag names) in the same order as the columns of tag_dtm.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older installations.
# .tolist() keeps `tags` a plain list of Python strings, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    tags = vectorizer.get_feature_names_out().tolist()
else:
    tags = vectorizer.get_feature_names()
tags[:10]

In [None]:
# Column sums of the document-term matrix give the total count of every tag;
# flatten them to 1-D and pair each tag name with its count.
freqs = np.asarray(tag_dtm.sum(axis=0)).ravel()
result = {tag: count for tag, count in zip(tags, freqs)}

In [None]:
tag_df = pd.DataFrame(result.items(), columns=["Tags", "Counts"])

In [None]:
tag_df.head()

In [None]:
tag_df_sorted = tag_df.sort_values(['Counts'], ascending=False)
tag_counts = tag_df_sorted["Counts"].values

In [None]:
# Counts of the most frequent tags (at most 100), with every 5th and every 25th
# tag highlighted. n_top caps the range at the number of tags available so the
# scatter x/y arrays always have matching lengths.
n_top = min(100, len(tag_counts))
plt.plot(tag_counts[:n_top])
plt.scatter(x=list(range(0, n_top, 5)), y=tag_counts[0:n_top:5], c='orange', label="Quantiles with 5 % intervals")
plt.scatter(x=list(range(0, n_top, 25)), y=tag_counts[0:n_top:25], c="red", label="Quantiles with 25th % intervals")
plt.grid()
plt.xlabel("Tag Number")
plt.ylabel("Number of times the tag Appear")
# The scatter labels are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Display the full tag -> count mapping (as a fresh dict copy).
dict(result)

In [None]:
# Word cloud in which every tag is sized according to its total count.
cloud_renderer = WordCloud(width=1600, height=800, background_color='black')
wordcloud = cloud_renderer.generate_from_frequencies(result)

plt.figure(figsize=(30, 20))
plt.imshow(wordcloud)
plt.show()

In [None]:
# Bar chart of the 30 most frequent tags, with the tag names as tick labels.
top_tags = tag_df_sorted.iloc[:30]
i = np.arange(30)
top_tags.plot.bar()
plt.xticks(i, top_tags['Tags'])
plt.show()

In [None]:
def striphtml(data):
    """Replace every ``<...>`` markup tag in ``data`` with a single space.

    ``data`` is converted with ``str()`` first, so non-string input is accepted.
    Matching is non-greedy and does not cross line breaks, so a tag that spans
    several lines is left untouched.
    """
    return re.sub('<.*?>', ' ', str(data))

# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Snowball stemmer configured for English; used to reduce words to their stems.
stemmer = SnowballStemmer('english')

In [None]:
df.shape

In [None]:
random_df = df.sample(50000)

In [None]:
random_df.head()

In [None]:
random_df.shape

In [None]:
# Clean every sampled question (title + body) and collect length statistics.
start = datetime.now()
preprocessed_data_list = []   # one record (dict) per processed question
questions_with_code = 0
len_pre = 0                   # total characters before cleaning
len_post = 0                  # total characters after cleaning
questions_proccesed = 0

# Read the columns by name instead of positional row[1][k] lookups, which
# depend on the column order and on deprecated positional Series indexing.
for raw_title, question, tags in zip(random_df["Title"], random_df["Body"], random_df["Tags"]):

    # The title seems to be a very important feature, so its weight is
    # increased by repeating it three times.
    title = 3 * (' ' + raw_title)

    is_code = 1 if '<code>' in question else 0
    questions_with_code += is_code

    # Character length of body + tripled title before any cleaning.
    x = len(question) + len(title)
    len_pre += x

    # Keep the code snippets separately (as the string form of the list of
    # matches), then remove them from the body text.
    code = str(re.findall(r'<code>(.*?)</code>', question, flags=re.DOTALL))
    question = re.sub('<code>(.*?)</code>', '', question, flags=re.MULTILINE|re.DOTALL)

    # Text stays as str throughout: encoding to bytes and calling str() on the
    # result yields the "b'...'" repr, whose escape sequences (\n, \xe2, ...)
    # would leak into the text as junk tokens such as "xe" once non-letters
    # are stripped.
    question = striphtml(question)
    question = title + " " + question

    # Keep letters only, lower-case and tokenize.
    question = re.sub(r'[^A-Za-z]+', ' ', question)
    words = word_tokenize(question.lower())

    # Drop stop words and single-letter tokens (except the letter 'c'), and
    # stem whatever remains.
    question = ' '.join(
        str(stemmer.stem(j))
        for j in words
        if j not in stop_words and (len(j) != 1 or j == 'c')
    )

    len_post += len(question)

    # NOTE: words_pre / words_post hold character counts, not word counts.
    preprocessed_data_list.append({
        "question": question,
        "code": code,
        "tags": tags,
        "words_pre": x,
        "words_post": len(question),
        "is_code": is_code,
    })
    questions_proccesed += 1
    # Report every 10000 questions; the previous interval of 100000 could never
    # be reached with a sample of 50000 questions.
    if questions_proccesed % 10000 == 0:
        print("number of questions completed=",questions_proccesed)

# Build the frame once from the collected records; appending to a DataFrame one
# row at a time is far slower.
prepared_df = pd.DataFrame(
    preprocessed_data_list,
    columns=['question','code','tags','words_pre','words_post','is_code'],
)

# max(..., 1) avoids a ZeroDivisionError when the sample is empty.
processed_or_one = max(questions_proccesed, 1)
no_dup_avg_len_pre = (len_pre * 1.0) / processed_or_one
no_dup_avg_len_post = (len_post * 1.0) / processed_or_one

print( "Avg. length of questions(Title+Body) before processing: %d"%no_dup_avg_len_pre)
print( "Avg. length of questions(Title+Body) after processing: %d"%no_dup_avg_len_post)
print ("Percent of questions containing code: %d"%((questions_with_code*100.0)/processed_or_one))

print("Time taken to run this cell :", datetime.now() - start)

In [None]:
# Preview the cleaned questions together with their extracted code and statistics.
prepared_df.head()

In [None]:
# Only the cleaned question text and its tags are used from here on.
preprocessed_data = prepared_df[["question","tags"]]

In [None]:
preprocessed_data.head()

In [None]:
vectorizer = CountVectorizer(tokenizer= lambda text : text.split(), binary=True)
multilabel_y = vectorizer.fit_transform(preprocessed_data["tags"])

In [None]:
multilabel_y.get_shape()

In [None]:
def tags_to_choose(n):
    """Return the columns of the global ``multilabel_y`` with the ``n`` largest sums.

    Columns are ordered from the largest to the smallest column sum; columns
    with equal sums keep their original relative order. If ``n`` exceeds the
    number of columns, all columns are returned.
    """
    column_totals = multilabel_y.sum(axis=0).tolist()[0]
    ranked_columns = sorted(
        range(len(column_totals)),
        key=column_totals.__getitem__,
        reverse=True,
    )
    top_columns = ranked_columns[:n]
    return multilabel_y[:, top_columns]

def questions_explained_fn(n):
    """Count the rows left with no label when only the top ``n`` tags are kept.

    Despite the name, the returned number is the count of rows whose entries in
    the matrix from ``tags_to_choose(n)`` sum to zero.
    """
    per_row_totals = tags_to_choose(n).sum(axis=1)
    return np.count_nonzero(per_row_totals == 0)

In [None]:
# For every tag budget 500, 600, ... (below the total number of tags), record
# the percentage of questions that keep at least one tag, rounded to 3 decimals.
total_tags = multilabel_y.shape[1]
total_qs = preprocessed_data.shape[0]
question_explained = []

for i in range(500, total_tags, 100):
    covered_qs = total_qs - questions_explained_fn(i)
    question_explained.append(np.round((covered_qs / total_qs) * 100, 3))

In [None]:
# question_explained[k] was computed for a budget of 500 + 100*k tags, so plot
# against those real budgets. Overwriting the tick labels with hand-made values
# (without fixing the tick positions) labels the axis incorrectly.
tag_budgets = [500 + 100 * k for k in range(len(question_explained))]

fig, ax = plt.subplots()
ax.plot(tag_budgets, question_explained)
plt.xlabel("Number of tags")
plt.ylabel("Number Questions coverd partially")
plt.grid()
plt.show()

# Any number of tags can be chosen depending on the available computing power;
# 5500 is the budget used in the rest of the notebook.
idx_5500 = (5500 - 500) // 100
if idx_5500 < len(question_explained):
    print("with ",5500,"tags we are covering ",question_explained[idx_5500],"% of questions")
elif question_explained:
    # The sample has too few distinct tags for a 5500 budget to have been
    # evaluated: report the largest budget that was.
    print("with ",tag_budgets[-1],"tags we are covering ",question_explained[-1],"% of questions")

In [None]:
multilabel_yx = tags_to_choose(5500)
print("number of questions that are not covered :", questions_explained_fn(5500),"out of ", total_qs)

In [None]:
multilabel_yx.get_shape()

In [None]:
print("Number of tags in sample :", multilabel_y.shape[1])
print("number of tags taken :", multilabel_yx.shape[1],"(",(multilabel_yx.shape[1]/multilabel_y.shape[1])*100,"%)")

In [None]:
total_size=preprocessed_data.shape[0]
train_size=int(0.80*total_size)

x_train=preprocessed_data.head(train_size)
x_test=preprocessed_data.tail(total_size - train_size)

y_train = multilabel_yx[0:train_size,:]
y_test = multilabel_yx[train_size:total_size,:]

In [None]:
print("Number of data points in train data :", y_train.shape)
print("Number of data points in test data :", y_test.shape)

In [None]:
tfidf_vect = TfidfVectorizer(min_df=0.00009,max_features=200000,smooth_idf=True,norm='l2',\
               tokenizer=lambda x : x.split(),sublinear_tf=False, ngram_range=(1,3) )
x_train_vectors = tfidf_vect.fit_transform(x_train['question'])
x_test_vectors = tfidf_vect.transform(x_test['question'])


In [None]:
print("Dimensions of train data X:",x_train_vectors.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_vectors.shape,"Y:",y_test.shape)

In [None]:
# One binary logistic-regression model per tag, trained with SGD and an L1 penalty.
#
# scikit-learn renamed the logistic loss from 'log' to 'log_loss' in 1.1 and
# removed the old name in 1.3. Use 'log' only when this installation lists it
# and does not know 'log_loss'; otherwise use the current name.
_sgd_losses = getattr(SGDClassifier, "loss_functions", {})
_log_loss_name = 'log' if ('log' in _sgd_losses and 'log_loss' not in _sgd_losses) else 'log_loss'

# random_state fixes the shuffling done by SGD so results are repeatable.
classifier = OneVsRestClassifier(
    SGDClassifier(loss=_log_loss_name, alpha=0.00001, penalty='l1', random_state=42),
    n_jobs=-1,
)
classifier.fit(x_train_vectors, y_train)

In [None]:
predictions = classifier.predict(x_test_vectors)

In [None]:
print("accuracy ", metrics.accuracy_score(y_test,predictions))
print("macro f1 score ",metrics.f1_score(y_test,predictions, average='macro'))
print("micro f1 score ", metrics.f1_score(y_test, predictions, average='micro'))
print("hamming loss ", metrics.hamming_loss(y_test,predictions))