# ***Query Domain Classification***

Importing required Libraries

In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import f1_score,precision_score, multilabel_confusion_matrix, accuracy_score,jaccard_score, recall_score, hamming_loss,confusion_matrix
from sklearn.multiclass import OneVsRestClassifier

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Loading the data

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,Title,Domain
0,1,"What is good in a decision tree, a large or a ...",Techniques
1,2,Training data only contains single positive label,Techniques
2,3,Calculating percentage contribution of a negat...,Techniques
3,4,Unable to open solution checker!,Hackathons
4,5,User Name Change,Misc


In [3]:
#lets check the shape of the data 
df.shape

(3845, 3)

In [4]:
#lets check the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3845 entries, 0 to 3844
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3845 non-null   int64 
 1   Title   3834 non-null   object
 2   Domain  3845 non-null   object
dtypes: int64(1), object(2)
memory usage: 90.2+ KB


Lets see how the Query  look like

In [5]:
df['Title'][0]

'What is good in a decision tree, a large or a small leaf size?'

In [7]:
#Create a new subset of the data by only taking the 2nd column onwards (comments and categories)
#data_count = df.iloc[:,1:].sum()

In [8]:
#data_count

Feature Engineering

Text processing

In [6]:
#importing required libraries for text processing 
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import wordnet
from nltk.corpus import wordnet as wn
#import lemmatizer


In [7]:
#function for lemmatization
def lemm(text):
    lemmatizer = wordnet.WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, wn.VERB) for word in word_tokens]
    
    return " ".join(lemmas)
#function for removing stopwords    
def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in string.punctuation]
    filtered_text = [word for word in filtered_text if word not in stop_words]
    return " ".join(filtered_text)

def decontracted(text):
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"im", "i am", text)
    text = re.sub(r"yo", "you",text)
    text = re.sub(r"youu", "you",text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r'http\S+', '', text) #removing urls
    return text

In [8]:
df.head()

Unnamed: 0,ID,Title,Domain
0,1,"What is good in a decision tree, a large or a ...",Techniques
1,2,Training data only contains single positive label,Techniques
2,3,Calculating percentage contribution of a negat...,Techniques
3,4,Unable to open solution checker!,Hackathons
4,5,User Name Change,Misc


In [9]:
df.isna().sum()

ID         0
Title     11
Domain     0
dtype: int64

In [10]:
df = df.dropna()

In [11]:
df.isna().sum()

ID        0
Title     0
Domain    0
dtype: int64

In [12]:
df.dtypes

ID         int64
Title     object
Domain    object
dtype: object

In [17]:
#  df['Title'] = df.Title.apply(lambda x: x.lower()) #lowering all text


In [13]:
def text_proc(df):
    
    df['Title'] = df.Title.apply(lambda x: x.lower()) #lowering all text
    df['Title'] = df.Title.apply(lambda x: re.sub(r'\d+','',x)) #removing numbers
    df['Title'] = df.Title.apply(lambda x: re.sub(r'\n',' ',x)) #removing \n
    df['Title'] = df.Title.apply(lambda x: decontracted(x))
    df['Title'] = df.Title.apply(lambda x: lemm(x))
    
    #removing punctuations
    translator = str.maketrans('','', string.punctuation)
    df['Title'] = df.Title.apply(lambda x : x.translate(translator))
    df['Title'] = df.Title.apply(lambda x: rem_stopwords(x))
text_proc(df)

In [19]:
#import nltk
#nltk.download('all')

In [14]:
#lets have a look at data set now
df

Unnamed: 0,ID,Title,Domain
0,1,good decision tree large small leaf size,Techniques
1,2,train data contain single positive label,Techniques
2,3,calculate percentage contribution negative com...,Techniques
3,4,unable open solution checker,Hackathons
4,5,user name change,Misc
...,...,...,...
3840,3841,find practice problems linear regression,Resources
3841,3842,run java code gpu aparapi,Tools
3842,3843,rpython script need assign cluster id po recor...,Tools
3843,3844,separate column two column,Techniques


EDA

In [15]:
import wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [16]:
def wordcloud(df, label):
    
    # Print only rows where the toxic category label value is 1 (ie. the comment is toxic)
    subset=df[df[label]==1]
    text=subset.Title.values
    wc= WordCloud(background_color="white",max_words=4000)

    wc.generate(" ".join(text))

    plt.figure(figsize=(20,20))
    plt.subplot(221)
    plt.axis("off")
    plt.title("Words frequented in {}".format(label), fontsize=20)
    plt.imshow(wc.recolor(colormap= 'gist_earth' , random_state=244))

In [23]:
#df_mal = df.loc[:,['ID','Title','Domain']]

In [24]:
#wordcloud(df, 'Domain')

Model building

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier

tfidf = TfidfVectorizer(analyzer='word',max_features = 2000)

In [44]:
x = tfidf.fit_transform(df.Title)

In [45]:
x.shape

(3834, 2000)

In [46]:
y = df.Domain
print(y.value_counts())
from sklearn.preprocessing import LabelEncoder
# Create a label encoder object
label_encoder = LabelEncoder()

# Fit the encoder on the target column and transform it
y = label_encoder.fit_transform(y)

Techniques    1852
Tools          917
Career         437
Hackathons     262
Resources      170
Other          122
Misc            74
Name: Domain, dtype: int64


In [47]:
np.unique(y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([ 437,  262,   74,  122,  170, 1852,  917]))

In [48]:
x

<3834x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 17957 stored elements in Compressed Sparse Row format>

In [49]:
y

array([5, 5, 5, ..., 6, 5, 5])

In [50]:
tfidf.vocabulary_

{'good': 677,
 'decision': 427,
 'tree': 1792,
 'large': 846,
 'small': 1538,
 'size': 1527,
 'train': 1764,
 'data': 405,
 'contain': 341,
 'single': 1523,
 'positive': 1168,
 'label': 841,
 'calculate': 222,
 'percentage': 1112,
 'negative': 1016,
 'component': 318,
 'unable': 1824,
 'open': 1062,
 'solution': 1548,
 'checker': 269,
 'user': 1863,
 'name': 1006,
 'change': 260,
 'decide': 425,
 'curve': 389,
 'fit': 621,
 'model': 970,
 'epochs': 546,
 'neural': 1022,
 'network': 1021,
 'chatbot': 264,
 'give': 672,
 'answer': 107,
 'ames': 59,
 'repository': 1369,
 'plot': 1135,
 'google': 679,
 'map': 919,
 'python': 1300,
 'algorithm': 33,
 'choose': 274,
 'response': 1387,
 'weak': 1914,
 'code': 295,
 'throw': 1735,
 'error': 550,
 'trial': 1795,
 'nonnumeric': 1037,
 'argument': 121,
 'binary': 192,
 'operator': 1065,
 'call': 225,
 'reduce': 1342,
 'learn': 860,
 'path': 1104,
 'pick': 1122,
 'become': 174,
 'scientist': 1449,
 'build': 214,
 'computer': 323,
 'vision': 1895,


In [51]:
#lets check the shape of x and y
x.shape, y.shape

((3834, 2000), (3834,))

## **Spliting data into train and test sets**

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 42)

In [53]:
#lets define different algorithms
svc = LinearSVC()
lr = LogisticRegression(solver='lbfgs')
mnb = MultinomialNB()
lgb = LGBMClassifier()
sgd = SGDClassifier()

In [54]:
#function for printing score
def print_score(y_pred,clf):
    print('classifier:',clf.__class__.__name__)
    #print("Jaccard score: {}".format(jaccard_score(y_test,y_pred,average='micro')))
    print("Accuracy score: {}".format(accuracy_score(y_test,y_pred)))
    print("f1_score: {}".format(f1_score(y_test,y_pred,average='micro')))
    print("Precision : ", precision_score(y_test,y_pred,average='micro'))
    print("Recall: {}".format(recall_score(y_test,y_pred,average='micro')))
   # print("Hamming loss: ", hamming_loss(y_test,y_pred))
    print("Confusion matrix:\n ", confusion_matrix(y_test,y_pred))
    print('========================================\n')   

In [55]:
#models with evaluation using OneVsRestClassifier
for classifier in [svc,lr,mnb,sgd,lgb]:
   classifier.fit(x_train,y_train)
   y_pred = classifier.predict(x_test)
   print_score(y_pred, classifier)

classifier: LinearSVC
Accuracy score: 0.6548488008342023
f1_score: 0.6548488008342023
Precision :  0.6548488008342023
Recall: 0.6548488008342023
Confusion matrix:
  [[ 93   0   0   1   3  15   5]
 [  1  27   0   0   0  30   3]
 [  3   0   0   1   1  11   3]
 [  2   0   0   1   5  23   3]
 [  8   1   0   4   6  22   5]
 [  7  10   3   1   2 382  40]
 [  4   0   1   0   3 110 119]]

classifier: LogisticRegression
Accuracy score: 0.6475495307612096
f1_score: 0.6475495307612096
Precision :  0.6475495307612096
Recall: 0.6475495307612096
Confusion matrix:
  [[ 81   0   0   0   1  32   3]
 [  0  21   0   0   0  40   0]
 [  4   0   0   0   0  13   2]
 [  2   0   0   0   1  28   3]
 [  9   1   0   0   2  32   2]
 [  4   5   0   0   0 403  33]
 [  2   0   0   0   0 121 114]]

classifier: MultinomialNB
Accuracy score: 0.6517205422314911
f1_score: 0.6517205422314911
Precision :  0.6517205422314911
Recall: 0.6517205422314911
Confusion matrix:
  [[ 92   0   0   0   0  21   4]
 [  2  14   0   0   0  

Hyperparameter Tuning

In [56]:
param = {
        'penalty': ['l1'],
        'loss': ['hinge','squared_hinge'],
        'multi_class': ['ovr','crammer_singer'],
        'dual': [False],
        'intercept_scaling': [2,4,5],
        'C': [2]
        }

In [57]:
#train the model with given parameters using GridSearchCV
svc = LinearSVC()
GCV =  GridSearchCV(svc,param,cv = 3, verbose =2,n_jobs=-1)
GCV.fit(x_train,y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


GridSearchCV(cv=3, estimator=LinearSVC(), n_jobs=-1,
             param_grid={'C': [2], 'dual': [False],
                         'intercept_scaling': [2, 4, 5],
                         'loss': ['hinge', 'squared_hinge'],
                         'multi_class': ['ovr', 'crammer_singer'],
                         'penalty': ['l1']},
             verbose=2)

In [58]:
GCV.best_params_

{'C': 2,
 'dual': False,
 'intercept_scaling': 2,
 'loss': 'squared_hinge',
 'multi_class': 'ovr',
 'penalty': 'l1'}

# Final model

In [59]:
model = LinearSVC(C=2,dual = False, loss='hinge',multi_class='crammer_singer', penalty ='l1',intercept_scaling=2)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

print("Jaccard score: {}".format(jaccard_score(y_test,y_pred,average='micro')))
print("Accuracy score: {}".format(accuracy_score(y_test,y_pred)))
print("f1_score: {}".format(f1_score(y_test,y_pred,average='micro')))
print("Precision : ", precision_score(y_test,y_pred,average='micro'))
print("Recall: {}".format(recall_score(y_test,y_pred,average='micro')))
#print("Hamming loss: ", hamming_loss(y_test,y_pred))
print("\nConfusion matrix: \n", confusion_matrix(y_test,y_pred))

Jaccard score: 0.4708588957055215
Accuracy score: 0.6402502606882169
f1_score: 0.6402502606882169
Precision :  0.6402502606882169
Recall: 0.6402502606882169

Confusion matrix: 
 [[ 95   0   0   1   3  14   4]
 [  1  27   0   0   3  26   4]
 [  2   0   1   2   1  10   3]
 [  3   1   0   2   6  20   2]
 [  9   1   0   6   8  18   4]
 [ 11  13   3   6   6 365  41]
 [  5   0   1   1   5 109 116]]


In [60]:
model.predict(x_test)

array([0, 5, 6, 5, 0, 5, 5, 5, 6, 5, 5, 5, 5, 5, 6, 5, 2, 3, 5, 5, 5, 1,
       5, 5, 1, 6, 0, 6, 5, 5, 6, 0, 5, 5, 5, 6, 6, 1, 5, 5, 5, 5, 5, 6,
       5, 0, 6, 5, 0, 5, 0, 5, 5, 5, 5, 0, 5, 5, 4, 6, 5, 5, 6, 5, 5, 5,
       6, 5, 5, 5, 5, 0, 5, 5, 5, 6, 0, 6, 5, 6, 6, 6, 6, 0, 5, 6, 5, 5,
       0, 6, 5, 5, 0, 5, 5, 6, 0, 1, 5, 5, 5, 0, 6, 5, 5, 1, 5, 5, 0, 0,
       6, 5, 5, 5, 6, 1, 5, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 0, 5, 6, 5,
       5, 5, 5, 5, 2, 5, 5, 5, 0, 3, 4, 6, 1, 0, 5, 0, 5, 5, 0, 1, 5, 1,
       0, 0, 0, 5, 2, 6, 0, 5, 5, 5, 5, 5, 0, 5, 5, 4, 5, 0, 1, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 6, 6, 1, 1, 5, 4, 6, 0, 6, 5, 0,
       6, 5, 5, 5, 5, 5, 4, 0, 5, 0, 6, 5, 5, 3, 0, 5, 0, 4, 5, 5, 1, 5,
       5, 6, 5, 0, 5, 5, 6, 3, 6, 5, 5, 5, 5, 0, 5, 4, 5, 5, 5, 5, 0, 6,
       4, 5, 5, 5, 1, 5, 5, 5, 0, 6, 5, 5, 4, 1, 5, 5, 0, 5, 5, 6, 5, 0,
       5, 6, 0, 6, 6, 5, 5, 6, 5, 0, 6, 6, 5, 0, 6, 4, 6, 5, 5, 0, 5, 5,
       5, 5, 5, 5, 3, 5, 3, 3, 5, 6, 6, 5, 5, 6, 5,

# Prediction for test dataset using final model

In [61]:
#Lets load the test data set
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,ID,Title
0,3846,Spark Cheatsheet
1,3847,Review of Random forest code in Python
2,3848,Chi-Sq Test for Numeric variables
3,3849,Prediction from loaded pickled file for single...
4,3850,"Even after installing Anaconda on my PC, I am ..."


In [62]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1649 entries, 0 to 1648
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1649 non-null   int64 
 1   Title   1649 non-null   object
dtypes: int64(1), object(1)
memory usage: 25.9+ KB


In [63]:
test.isna().sum()

ID       0
Title    0
dtype: int64

In [64]:

text_proc(test)

In [68]:
tfidf = TfidfVectorizer(analyzer='word',max_features = 2000)
x1 = tfidf.fit_transform(test.Title)

In [69]:
x1

<1649x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 8125 stored elements in Compressed Sparse Row format>

In [70]:
x1.shape

(1649, 2000)

In [71]:
#lets predict the output
model.predict(x1)

array([5, 5, 6, ..., 5, 5, 5])

In [72]:

predictions = model.predict(x1)

In [73]:
pred=pd.DataFrame(predictions, columns = ['Target'])
pred

Unnamed: 0,Target
0,5
1,5
2,6
3,5
4,1
...,...
1644,4
1645,6
1646,5
1647,5


In [74]:
x_test = pd.concat([test,pred], axis=1)
x_test

Unnamed: 0,ID,Title,Target
0,3846,spark cheatsheet,5
1,3847,review random forest code python,5
2,3848,chisq test numeric variables,6
3,3849,prediction load pickle file single instance input,5
4,3850,even instal anaconda pc unable access jupyter ...,1
...,...,...,...
1644,5490,take variables train data randomforest model,4
1645,5491,ti ame series forecast reduce stationary series,6
1646,5492,data visualization text analysis twitter mine,5
1647,5493,cross validation strategy stack model,5


In [75]:
x_test.Target.value_counts()

5    959
6    332
0    112
4     84
1     68
2     56
3     38
Name: Target, dtype: int64

In [76]:
x_test.dtypes

ID         int64
Title     object
Target     int64
dtype: object

In [77]:
x_test['Domain'] = x_test['Target'].replace({0: 'Career' ,
                                             1: 'Hackathons',
                                             2:'Misc',
                                             3: 'Other',
                                             4:'Resources',
                                             5: 'Techniques',
                                             6: 'Tools'})

In [78]:
x_test

Unnamed: 0,ID,Title,Target,Domain
0,3846,spark cheatsheet,5,Techniques
1,3847,review random forest code python,5,Techniques
2,3848,chisq test numeric variables,6,Tools
3,3849,prediction load pickle file single instance input,5,Techniques
4,3850,even instal anaconda pc unable access jupyter ...,1,Hackathons
...,...,...,...,...
1644,5490,take variables train data randomforest model,4,Resources
1645,5491,ti ame series forecast reduce stationary series,6,Tools
1646,5492,data visualization text analysis twitter mine,5,Techniques
1647,5493,cross validation strategy stack model,5,Techniques


In [81]:
 x_test.drop(columns = ['Title','Target'],inplace = True)

In [82]:
x_test

Unnamed: 0,ID,Domain
0,3846,Techniques
1,3847,Techniques
2,3848,Tools
3,3849,Techniques
4,3850,Hackathons
...,...,...
1644,5490,Resources
1645,5491,Tools
1646,5492,Techniques
1647,5493,Techniques


In [83]:
#saving the data into csv file
x_test.to_csv(r"solution_submission.csv")

In [84]:
x_test

Unnamed: 0,ID,Domain
0,3846,Techniques
1,3847,Techniques
2,3848,Tools
3,3849,Techniques
4,3850,Hackathons
...,...,...
1644,5490,Resources
1645,5491,Tools
1646,5492,Techniques
1647,5493,Techniques
