In [227]:
import numpy as np
import gensim
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
import pickle as pk
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from cleantext import clean
import re

In [228]:
# Read the complete training and test tables from their CSV files.
total_train = pd.read_csv(
    filepath_or_buffer="/home/msc2/dbpedia/dbpedia_csv/train.csv"
)
total_test = pd.read_csv(
    filepath_or_buffer="/home/msc2/dbpedia/dbpedia_csv/test.csv"
)

#### Only about 1% of the given training data set is used, because of the slow execution time on the available system

In [287]:
# Keep roughly 1% of the labelled rows for training; test_size=0.99 sends the
# remaining rows to x_test1 / y_test1.
# random_state pins the shuffle so the sampled training subset is the same on
# every run (the same seed value is used when sampling the test rows).
x_train1, x_test1, y_train1, y_test1 = sklearn.model_selection.train_test_split(
    total_train["Content"],
    total_train["Class"],
    test_size=0.99,
    random_state=1,
)

In [288]:
# Training text: the "Content" values kept on the training side of the split.
train = x_train1 

In [289]:
# Training labels: the "Class" values kept on the training side of the split.
train_target = y_train1

In [290]:
# Show the training labels as a quick sanity check.
train_target

532021    14
248348     7
128468     4
69578      2
471512    12
          ..
523464    14
179872     5
131933     4
219110     6
455287    12
Name: Class, Length: 5600, dtype: int64

In [291]:
# Check how many training documents were kept by the split.
train.shape

(5600,)

#### A tf-idf representation of the plain text is used

In [304]:
# tf-idf vectorizer with English stop words removed and sublinear
# term-frequency scaling switched on.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    stop_words="english",
)

#### The statement below learns the vocabulary of the given training dataset

In [305]:
# Fit the vectorizer on the training text only; the fitted object is reused
# below to transform both the training and the test documents.
tfidf_fitted = tfidf.fit(train)

#### The statement below returns the tf-idf scores of the training documents over the entire vocabulary

In [306]:
# tf-idf feature matrix for the training documents (input to both classifiers).
tfidf_scores = tfidf_fitted.transform(train)

In [307]:
# Show the summary (shape, dtype, stored elements) of the training matrix.
tfidf_scores

<5600x32408 sparse matrix of type '<class 'numpy.float64'>'
	with 136949 stored elements in Compressed Sparse Row format>

# Classification

## Random Forest

#### All the chosen models are trained on the tf-idf scores of the training data set and are tested on the tf-idf scores of a test sample consisting of 1000 data points

In [315]:
# Draw a fixed (seeded) random sample of 1000 rows from the test table and
# split it into the raw text and its labels.
temp = total_test.sample(random_state=1, n=1000)
test, test_target = temp["Content"], temp["Class"]

In [316]:
# Random forest baseline: 100 trees, each limited to depth 5, trained on the
# tf-idf matrix of the training documents.
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=5)
clf_rf.fit(tfidf_scores, train_target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [317]:
# Vectorise the sampled test text with the vocabulary learned from the
# training set. The raw text is read from `temp` rather than from `test`
# itself, so re-running this cell does not feed the already-transformed
# matrix back into the vectorizer.
test = tfidf_fitted.transform(temp["Content"])

In [318]:
# Mean accuracy of the random forest on the 1000 sampled test documents.
clf_rf.score(test, test_target)

0.787

## Linear SVM

In [319]:
# Linear support vector classifier with the library's default settings.
clf_svm = LinearSVC()

In [320]:
# Train the linear SVM on the same tf-idf matrix used for the random forest.
clf_svm.fit(tfidf_scores, train_target)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [321]:
# Mean accuracy of the linear SVM on the 1000 sampled test documents.
clf_svm.score(test, test_target)

0.961

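#### Observation: Since a linear classifier such as the SVM performs better than the Random Forest, which is predominantly non-linear, it may be the case that the considered data is linearly separable. Note, however, that the Random Forest here is limited to a maximum depth of 5, so part of the gap may come from that constraint rather than from the data alone.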