# Assignment on Text Classification

### Importing Modules

In [1]:
import glob, os, gdown, time, re, joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn import metrics
import pandas as pd
import numpy as np

### Loading Dataset

In [2]:
!gdown --id 1DKicn9YfqsK6k5fJFOHgclZA6HkSW1Uq
!unzip songData.zip

Downloading...
From: https://drive.google.com/uc?id=1DKicn9YfqsK6k5fJFOHgclZA6HkSW1Uq
To: /content/songData.zip
  0% 0.00/1.16M [00:00<?, ?B/s]100% 1.16M/1.16M [00:00<00:00, 18.2MB/s]
Archive:  songData.zip
  inflating: songData/songType0.xlsx  
  inflating: songData/songType1.xlsx  
  inflating: songData/songType10.xlsx  
  inflating: songData/songType11.xlsx  
  inflating: songData/songType12.xlsx  
  inflating: songData/songType13.xlsx  
  inflating: songData/songType14.xlsx  
  inflating: songData/songType15.xlsx  
  inflating: songData/songType16.xlsx  
  inflating: songData/songType17.xlsx  
  inflating: songData/songType18.xlsx  
  inflating: songData/songType19.xlsx  
  inflating: songData/songType2.xlsx  
  inflating: songData/songType20.xlsx  
  inflating: songData/songType3.xlsx  
  inflating: songData/songType4.xlsx  
  inflating: songData/songType5.xlsx  
  inflating: songData/songType6.xlsx  
  inflating: songData/songType7.xlsx  
  inflating: songData/songType8.xlsx  


### Preprocessing Data for Training

In [3]:
excel_files = glob.glob("/content/songData/*.xlsx")
len(excel_files)

21

In [4]:
names = []
count = []
clusters = []
data = []

for each in excel_files:
  temp = pd.read_excel(each)
  count.append(len(temp))
  names.append(each.split('/')[1][:-5])
  clusters.append(temp['songType'])
  data.append(temp)

cluster_ids = clusters

In [5]:
data = pd.concat(data)
data = data.sample(frac=1)

data

Unnamed: 0.1,Unnamed: 0,lyrics,songType
296,1477,রাত্রি অনেক হল চোখে নেই কোন ঘুম অপরুপ জোছনায় অ...,13
4,271,আমার ইচ্ছে করে আকাশ বাড়ির ছাদ ভেঙে বৃষ্টি আসুক...,5
688,3177,"প্রভু, খেলেছি অনেক খেলা এবে তোমার ক্রোড় চাহি।...",17
305,946,"বেলা শেষে ক্লান্ত পাখি ফিরে যায় নীড়ে, হৃদয়ে বি...",1
773,3852,আমি কান পেতে রই। ও আমার আপন হৃদয়গহন দ্বারে বা...,17
...,...,...,...
629,2324,নিত্য তোমার যে ফুল ফোটে ফুলবনে তারি মধু কেন মন...,17
51,306,আমাদের দেশটা স্বপ্নপুরী সাথী মোদের ফুলপরী ফুলপ...,1
357,1797,তোমার পূজার ছলে তোমায় ভুলেই থাকি। বুঝতে নারি ক...,17
15,3645,দুই ভুবনের দুই বাসিন্দা বন্ধু চিরকাল রেললাইন ব...,8


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(data['lyrics'], data['songType'], test_size=0.2, random_state=2)

### Text Classification using KNN

In [7]:
text_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=9))
])

classifier = text_clf.fit(list(X_train), list(Y_train))

cv = ShuffleSplit(n_splits=7, test_size=0.2, random_state=2)
score = cross_val_score(text_clf, list(X_train), list(Y_train), cv=cv)

print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

predicted = text_clf.predict(X_test)

Cross Accuracy: 0.34 (+/- 0.03)


In [8]:
joblib.dump(classifier, '/content/knn_text_classifier.pickle')

['/content/knn_text_classifier.pickle']

In [9]:
print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print(metrics.classification_report(Y_test, predicted, zero_division=0))

Cross Accuracy: 0.34 (+/- 0.03)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.29      0.52      0.37       201
           2       0.00      0.00      0.00         2
           3       0.24      0.31      0.27       154
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00        11
           6       0.67      0.22      0.33        18
           7       0.27      0.14      0.19        28
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00        14
          12       0.33      0.04      0.07        25
          13       0.29      0.24      0.26       143
          14       0.67      0.22      0.33         9
          16       0.00      0.00      0.00         1
          17       0.70      0.38      0.49      

### Text Classification using Naive Bayes

In [10]:
text_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
])

classifier = text_clf.fit(list(X_train), list(Y_train))

cv = ShuffleSplit(n_splits=7, test_size=0.2, random_state=2)
score = cross_val_score(text_clf, list(X_train), list(Y_train), cv=cv)

print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

predicted = text_clf.predict(X_test)

Cross Accuracy: 0.39 (+/- 0.05)


In [11]:
joblib.dump(classifier, '/content/naive_bayes_text_classifier.pickle')

['/content/naive_bayes_text_classifier.pickle']

In [12]:
print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print(metrics.classification_report(Y_test, predicted, zero_division=0))

Cross Accuracy: 0.39 (+/- 0.05)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.31      0.93      0.47       201
           2       0.00      0.00      0.00         2
           3       0.40      0.14      0.20       154
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        18
           7       0.00      0.00      0.00        28
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00        14
          12       0.00      0.00      0.00        25
          13       0.39      0.06      0.11       143
          14       0.00      0.00      0.00         9
          16       0.00      0.00      0.00         1
          17       0.77      0.63      0.69      

### Text Classification using Random Forest

In [13]:
text_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=50, max_depth=21, random_state=2))
])

classifier = text_clf.fit(list(X_train), list(Y_train))

cv = ShuffleSplit(n_splits=7, test_size=0.2, random_state=2)
score = cross_val_score(text_clf, list(X_train), list(Y_train), cv=cv)

print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

predicted = text_clf.predict(X_test)

Cross Accuracy: 0.41 (+/- 0.02)


In [14]:
joblib.dump(classifier, '/content/random_forest_text_classifier.pickle')

['/content/random_forest_text_classifier.pickle']

In [15]:
print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print(metrics.classification_report(Y_test, predicted, zero_division=0))

Cross Accuracy: 0.41 (+/- 0.02)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.33      0.80      0.46       201
           2       0.00      0.00      0.00         2
           3       0.37      0.18      0.24       154
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        18
           7       1.00      0.14      0.25        28
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00        14
          12       0.00      0.00      0.00        25
          13       0.55      0.19      0.28       143
          14       1.00      0.11      0.20         9
          16       0.00      0.00      0.00         1
          17       0.65      0.70      0.67      

### Text Classification using Decision Tree

In [16]:
text_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', DecisionTreeClassifier(criterion="entropy", max_depth=7))
])

classifier = text_clf.fit(list(X_train), list(Y_train))

cv = ShuffleSplit(n_splits=7, test_size=0.2, random_state=2)
score = cross_val_score(text_clf, list(X_train), list(Y_train), cv=cv)

print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

predicted = text_clf.predict(X_test)

Cross Accuracy: 0.30 (+/- 0.04)


In [17]:
joblib.dump(classifier, '/content/decision_tree_text_classifier.pickle')

['/content/decision_tree_text_classifier.pickle']

In [18]:
print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print(metrics.classification_report(Y_test, predicted, zero_division=0))

Cross Accuracy: 0.30 (+/- 0.04)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.29      0.51      0.37       201
           2       0.00      0.00      0.00         2
           3       0.18      0.06      0.09       154
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        18
           7       0.13      0.07      0.09        28
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00        14
          12       0.00      0.00      0.00        25
          13       0.46      0.21      0.29       143
          14       1.00      0.11      0.20         9
          16       0.00      0.00      0.00         1
          17       0.37      0.60      0.46      

### Text Classification using ANN

In [22]:
text_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MLPClassifier(solver='lbfgs'))
])

classifier = text_clf.fit(list(X_train), list(Y_train))

cv = ShuffleSplit(n_splits=7, test_size=0.2, random_state=2)
score = cross_val_score(text_clf, list(X_train), list(Y_train), cv=cv)

print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

predicted = text_clf.predict(X_test)

Cross Accuracy: 0.42 (+/- 0.03)


In [23]:
joblib.dump(classifier, '/content/ann_text_classifier.pickle')

['/content/ann_text_classifier.pickle']

In [24]:
print("Cross Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print(metrics.classification_report(Y_test, predicted, zero_division=0))

Cross Accuracy: 0.42 (+/- 0.03)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.37      0.38      0.37       201
           2       0.00      0.00      0.00         2
           3       0.38      0.42      0.40       154
           4       0.50      0.50      0.50         2
           5       0.25      0.18      0.21        11
           6       0.42      0.28      0.33        18
           7       0.26      0.39      0.31        28
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00         7
          11       0.13      0.14      0.14        14
          12       0.06      0.04      0.05        25
          13       0.37      0.35      0.36       143
          14       0.18      0.22      0.20         9
          16       0.00      0.00      0.00         1
          17       0.70      0.65      0.67      

## 2. The explanation behind ”The model gives very low f1 score for some classes but not the same for others”is given below :

We know that the F1/F Score is a measure of how accurate a model
is by using Precision and Recall following the formula of:

F1 Score = 2 * ((Precision * Recall) / (Precision + Recall))

Precision is commonly called positive predictive value. It is also inte-
resting to note that the PPV can be derived using Bayes’ theorem as

well.
Precision = True Positives / (True Positives + False Positives)

Recall is also known as the True Positive Rate and is defined as the
following:
Recall = True Positives / (True Positives + False Negatives)

If the precision is very low and recall value gets very high then the
F1 score will become very low.But it should become the average of
precision and recall.The alternative situation aslo behave the same.
So, In the end, We can say. Some model gives the high precision and
high recall value ,which are made the F1 score high.But if one’s score
gets very low then the F1 score also become very low.

# 3. The low f1 score issue is tried to fix in below :

If the F1-score is the figure of merit, I would try to tune the class
weights. It should be pretty easy, if we have a binary classification
problem. We can feed class weight a dictionary with the weights for
each class.
Here’s a little example.

```
clf = RandomForestClassifier()
params = {’class weight’:[{0:neg weight, 1:1} for neg weight in np.arange(1.0,
5.0, 0.5)]}
gs = GridSearchCV(estimator=clf,param_grid=params, cv=5)
gs.fit(X_train, Y_train)
```

