In [1]:
import numpy as np
import gensim
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
import pickle as pk
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from cleantext import clean
import re
from sklearn.metrics import roc_auc_score

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
total_train = pd.read_csv("/home/msc2/dbpedia/dbpedia_csv/train.csv")
total_test = pd.read_csv("/home/msc2/dbpedia/dbpedia_csv/test.csv")

In [60]:
total_train["Class"].value_counts()

14    40000
13    40000
12    40000
11    40000
10    40000
9     40000
8     40000
7     40000
6     40000
5     40000
4     40000
3     40000
2     40000
1     40000
Name: Class, dtype: int64

#### This shows that the entire data set is properly balanced and does not require any other preprocessing for balancing the data

In [3]:
x_train = total_train["Content"]
y_train = total_train["Class"]
x_test = total_test["Content"]
y_test = total_test["Class"]

In [4]:
x_train.shape

(560000,)

#### The following function gets the tf-idf scores of the entire vocabulary omitting the stop words

In [5]:
tfidf = TfidfVectorizer(stop_words= 'english', sublinear_tf= True)

#### The below statement learns the vocabulary of the given training dataset

In [6]:
tfidf_fitted = tfidf.fit(x_train)

#### The below statement returns a tf-idf scores of the entire vocabulary in the training data set

In [7]:
tfidf_scores = tfidf_fitted.transform(x_train)

In [8]:
tfidf_scores

<560000x669038 sparse matrix of type '<class 'numpy.float64'>'
	with 13695374 stored elements in Compressed Sparse Row format>

# Classification

## Random Forest

#### All the chosen models have been trained on the tf-idf scores obtained on the training data set and are tested on tf-idf of test data consisting of 1000 datapoints

In [9]:

for i in range(1,21):
    
    clf_rf = RandomForestClassifier(max_depth=3,n_estimators= i*5)
    #clf_rf.fit(tfidf_scores,y_train)
    print(i*5,"  " ,np.mean(cross_val_score(clf_rf,tfidf_scores,y_train)))

5    0.1651357142857143
10    0.23315357142857143
15    0.2663410714285714
20    0.3030839285714286
25    0.36097678571428576
30    0.4064660714285714
35    0.4414160714285714
40    0.4858339285714285
45    0.4854428571428572
50    0.5353714285714286
55    0.5515857142857142
60    0.5531214285714287
65    0.5934339285714285
70    0.6250214285714286
75    0.6259392857142857
80    0.6388517857142857
85    0.6142
90    0.6356839285714286
95    0.6565267857142857
100    0.6945857142857144


#### In the above cross validation i have swept from 5 trees to 100 trees and found the validation accuracy incresing along the line of increse of number of trees. All the while i have fixed the depth of trees to be 3, the default value of the package.

In [16]:

for i in range(1,10):
    
    clf_rf = RandomForestClassifier(max_depth=i,n_estimators= 100)
    #clf_rf.fit(tfidf_scores,y_train)
    print(i,"  " ,np.mean(cross_val_score(clf_rf,tfidf_scores,y_train)))

1    0.4067071428571428
2    0.6020535714285715
3    0.6796821428571429
4    0.7212928571428572
5    0.7548785714285714
6    0.7999642857142857
7    0.8243750000000001
8    0.8301589285714286
9    0.8473267857142857


#### In the above cross validation done for finding the best value for the depth of trees it is clearly seen that the validation accuracy keeps increasing with the increase of number of trees.

#### Finally i have taken 200 chosen trees with with depth 15.

In [45]:
clf_rf = RandomForestClassifier(max_depth= 15,n_estimators= 200)
clf_rf.fit(tfidf_scores,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
test = tfidf_fitted.transform(x_test)

In [47]:
clf_rf.score(test,y_test)

0.9076

In [48]:
predicted = clf_rf.predict(test)

In [49]:
score_prob = clf_rf.predict_proba(test)

#### Have used the roc_auc score to evaluate the models performance 

In [50]:
roc_auc_score(y_true= y_test,y_score= score_prob,multi_class= 'ovo' )

0.991612305934066

## Linear SVM

In [51]:
clf_svm  = LinearSVC()

In [52]:
clf_svm.fit(tfidf_scores,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [53]:
clf_svm.score(test,y_test)

0.9811142857142857

#### Observation:  Since a Linear classifier such as SVM is performing better than the Random Forests which are prominently non-linear it may be the case that the considered data is linearly classifiable  

In [54]:
predicted = clf_svm.predict(test)

#### Since probability values are not available for the SVM model could not get the roc_auc score for it. Though its possible to get the probability values by using paltt's scaling; will be working on it post this submission