## 1.Reading preprocessed data

In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [52]:
train=pd.read_csv("train_preprocessed.csv")

In [53]:
train.head()

Unnamed: 0.1,Unnamed: 0,Text,Tags
0,0,two cultur statist vs machin learn last year r...,['machine-learning']
1,1,forecast demograph census way forecast demogra...,['forecasting']
2,2,bayesian frequentist reason plain english woul...,['bayesian']
3,3,mean p valu valu statist test take statist cou...,"['hypothesis-testing', 't-test', 'p-value', 'i..."
4,4,exampl teach correl mean causat old say correl...,['correlation']


In [54]:
train.drop(["Unnamed: 0"], axis = 1,inplace=True)

In [55]:
train.head()

Unnamed: 0,Text,Tags
0,two cultur statist vs machin learn last year r...,['machine-learning']
1,forecast demograph census way forecast demogra...,['forecasting']
2,bayesian frequentist reason plain english woul...,['bayesian']
3,mean p valu valu statist test take statist cou...,"['hypothesis-testing', 't-test', 'p-value', 'i..."
4,exampl teach correl mean causat old say correl...,['correlation']


In [56]:
train.isnull().values.any()
train = train.dropna(how='any',axis=0)
train.isnull().values.any()

False

### Getting text and Tags

In [57]:
X=train['Text']

In [58]:
import ast
train['Tags'] = train['Tags'].apply(lambda x: ast.literal_eval(x))

In [59]:
Tags=train["Tags"]
Tags

0                                       [machine-learning]
1                                            [forecasting]
2                                               [bayesian]
3        [hypothesis-testing, t-test, p-value, interpre...
4                                            [correlation]
                               ...                        
85080    [logistic, categorical-data, interaction, inte...
85081                                       [linear-model]
85082    [machine-learning, hypothesis-testing, statist...
85083                     [hypothesis-testing, self-study]
85084                                 [hypothesis-testing]
Name: Tags, Length: 73527, dtype: object

## 2.Binarization of y label

In [60]:
multilabel=MultiLabelBinarizer()

In [61]:
y=multilabel.fit_transform(Tags)

In [65]:
multilabel.classes_

array(['anova', 'arima', 'autocorrelation', 'bayesian', 'binomial',
       'bootstrap', 'categorical-data', 'chi-squared', 'classification',
       'clustering', 'conditional-probability', 'confidence-interval',
       'correlation', 'covariance', 'cross-validation', 'data-mining',
       'data-transformation', 'data-visualization', 'dataset',
       'distributions', 'econometrics', 'estimation', 'experiment-design',
       'feature-selection', 'forecasting', 'generalized-linear-model',
       'hypothesis-testing', 'interaction', 'interpretation',
       'least-squares', 'linear-model', 'logistic', 'machine-learning',
       'mathematical-statistics', 'matlab', 'maximum-likelihood', 'mean',
       'mixed-model', 'model-selection', 'modeling',
       'multilevel-analysis', 'multiple-comparisons',
       'multiple-regression', 'multivariate-analysis', 'neural-networks',
       'nonparametric', 'normal-distribution', 'optimization', 'p-value',
       'panel-data', 'pca', 'poisson', 'predi

In [66]:
pd.DataFrame(y,columns=multilabel.classes_)

Unnamed: 0,anova,arima,autocorrelation,bayesian,binomial,bootstrap,categorical-data,chi-squared,classification,clustering,conditional-probability,confidence-interval,correlation,covariance,cross-validation,data-mining,data-transformation,data-visualization,dataset,distributions,econometrics,estimation,experiment-design,feature-selection,forecasting,generalized-linear-model,hypothesis-testing,interaction,interpretation,least-squares,linear-model,logistic,machine-learning,mathematical-statistics,matlab,maximum-likelihood,mean,mixed-model,model-selection,modeling,multilevel-analysis,multiple-comparisons,multiple-regression,multivariate-analysis,neural-networks,nonparametric,normal-distribution,optimization,p-value,panel-data,pca,poisson,prediction,predictive-models,probability,python,r,random-forest,random-variable,references,regression,repeated-measures,sample-size,sampling,self-study,simulation,spss,standard-deviation,stata,statistical-significance,survival,svm,t-test,time-series,variance
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73522,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
73523,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
73524,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
73525,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


## 3.Splitting Test and Train

In [92]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## 4.tfidf Vectorizer 
choosing max_feature=100000<br>
max_df=0.8 <br>
min_df=3 <br>

In [93]:
tfidf=TfidfVectorizer(ngram_range=(1,2),max_features=100000,max_df=0.8,min_df=3,stop_words="english")
x_train=tfidf.fit_transform(x_train)

In [94]:
x_test=tfidf.transform(x_test)

In [95]:
len(tfidf.vocabulary_)

100000

In [97]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score,precision_score,recall_score
import warnings
warnings.filterwarnings("ignore")

## 5.Logistic Regression

### 5.1.Training logistic regression on optimal C

In [98]:
lr=LogisticRegression(solver='saga',C=20)
clf=OneVsRestClassifier(lr)
clf.fit(x_train,y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=20, solver='saga'))

### 5.2.Testing

In [99]:
predictions = clf.predict(x_test)
f1 = f1_score(y_test, predictions, average='micro')

In [101]:
f1

0.5106823356254169

In [100]:
from sklearn.metrics import jaccard_score
jaccard_score(y_test, predictions,average='micro')

0.3428968499073502

## 6.SVM using SGDClassifier

### 6.1.Training SVM on optimal alpha

In [102]:
from sklearn.linear_model import SGDClassifier
lr=SGDClassifier(loss = 'hinge', alpha =0.00001)
clf=OneVsRestClassifier(lr)
clf.fit(x_train,y_train)

OneVsRestClassifier(estimator=SGDClassifier(alpha=1e-05))

### 6.2.Testing

In [103]:
predictions = clf.predict(x_test)
f1 = f1_score(y_test, predictions, average='micro')
f1

0.5220475012831879

In [104]:
from sklearn.metrics import jaccard_score
jaccard_score(y_test, predictions,average='micro')

0.35322346403990657

|Model|F1 score|jaccard_score|
|-----|--------|-------------|
|Logistic Regression|0.510|0.342|
|SVM|0.522|0.353|