## Text classification with LinearSVC model

In [1]:
import pandas as pd
import numpy as np

### Loading the data

In [2]:
train_data=pd.read_csv('sentiment_dataset_train.csv')
dev_data=pd.read_csv('sentiment_dataset_dev.csv') 
test_data=pd.read_csv('sentiment_dataset_test.csv') 
train_data.head()

Unnamed: 0,id,review,rating
0,0,Arrived about 10pm and check in was painless. ...,4
1,1,I checked in at 4pm even tough room was not re...,2
2,2,"I chose this hotel, as it was in a good locati...",2
3,3,"Great location, super close to shops & a 10min...",4
4,4,I was in the Sir Adam Hotel to visit a friend....,3


### data splitting

In [3]:
X_train = train_data['review']
y_train = train_data['rating'].astype(str)
X_dev = dev_data['review']
y_dev = dev_data['rating'].astype(str)

### Build pipelines to vectorize the data, then train and fit a model

Using "LinearSVC" model in here. Buiding a pipline for text classification based on this model.

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

### Feed the training data through the pipeline

In [5]:
text_clf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

### Run predictions and analyze the results (Linear SVC)

In [6]:
# Form a prediction set
predictions = text_clf_lsvc.predict(X_dev)

In [7]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_dev,predictions))

[[1279  172   49   16    7]
 [ 262 1006  191   41    7]
 [  64  172 1117  109   21]
 [  15   50  155 1086  194]
 [   6   11   39  240 1190]]


In [8]:
# Print a classification report
print(metrics.classification_report(y_dev,predictions))

              precision    recall  f1-score   support

           1       0.79      0.84      0.81      1523
           2       0.71      0.67      0.69      1507
           3       0.72      0.75      0.74      1483
           4       0.73      0.72      0.73      1500
           5       0.84      0.80      0.82      1486

    accuracy                           0.76      7499
   macro avg       0.76      0.76      0.76      7499
weighted avg       0.76      0.76      0.76      7499



In [9]:
# Print the overall accuracy
print(metrics.accuracy_score(y_dev,predictions))

0.7571676223496466


#### We can improve the accuracy by tunning hyperparameters

In [10]:
# Linear SVC with hyperparameters:
text_clf_lsvch = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                     ('clf', LinearSVC(C=0.4)),
])

In [11]:
text_clf_lsvch.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC(C=0.4))])

In [12]:
# Form a prediction set
predictions = text_clf_lsvch.predict(X_dev)

In [13]:
print(metrics.confusion_matrix(y_dev,predictions))

[[1294  166   48    8    7]
 [ 278 1010  184   28    7]
 [  69  168 1125   98   23]
 [  17   44  157 1103  179]
 [   5   10   36  239 1196]]


In [14]:
# Print a classification report
print(metrics.classification_report(y_dev,predictions))

              precision    recall  f1-score   support

           1       0.78      0.85      0.81      1523
           2       0.72      0.67      0.70      1507
           3       0.73      0.76      0.74      1483
           4       0.75      0.74      0.74      1500
           5       0.85      0.80      0.83      1486

    accuracy                           0.76      7499
   macro avg       0.76      0.76      0.76      7499
weighted avg       0.76      0.76      0.76      7499



In [15]:
# Print the overall accuracy
print(metrics.accuracy_score(y_dev,predictions))

0.7638351780237365


### Using gridsearch to see if we can get a better result

In [16]:
from sklearn.model_selection import GridSearchCV

In [30]:
# Gridsearch to determine the value of C
param_grid = {'clf__C':np.arange(0.01,20,2)}
linearSVC = GridSearchCV(text_clf_lsvc,param_grid,cv=5,return_train_score=True)
linearSVC.fit(X_train,y_train)
print(linearSVC.best_params_)



{'clf__C': 2.01}


In [32]:
linearSVC.best_estimator_

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC(C=2.01))])

In [33]:
bestlinearSVC.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC(C=5.01))])

In [34]:
predictions = text_clf_lsvch.predict(X_dev)

In [35]:
print(metrics.confusion_matrix(y_dev,predictions))

[[1294  166   48    8    7]
 [ 278 1010  184   28    7]
 [  69  168 1125   98   23]
 [  17   44  157 1103  179]
 [   5   10   36  239 1196]]


In [36]:
print(metrics.classification_report(y_dev,predictions))

              precision    recall  f1-score   support

           1       0.78      0.85      0.81      1523
           2       0.72      0.67      0.70      1507
           3       0.73      0.76      0.74      1483
           4       0.75      0.74      0.74      1500
           5       0.85      0.80      0.83      1486

    accuracy                           0.76      7499
   macro avg       0.76      0.76      0.76      7499
weighted avg       0.76      0.76      0.76      7499



In [37]:
print(metrics.accuracy_score(y_dev,predictions))

0.7638351780237365


In [None]:
Again we obtained the same result. 

### The best accuracy is ~76.4% for LinearSVC model

### Predicting ratings for test results

In [39]:
text_clf_lsvch.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC(C=0.4))])

In [43]:
X_test = test_data['review']

In [44]:
predictions = text_clf_lsvch.predict(X_test)

In [45]:
predictions

array(['3', '5', '1', ..., '3', '2', '2'], dtype=object)

In [46]:
test_data.head()

Unnamed: 0,id,review
0,0,Not at all what expected. Our mountain view ...
1,1,Good location as we needed to head to Reims th...
2,3,Me and my son just returned from Broadmoor Mia...
3,4,The place was filthy and full of stoned backpa...
4,6,The hotel itself is really nice and modern whi...


In [47]:
test_data['rating']=predictions
test_data.head()

Unnamed: 0,id,review,rating
0,0,Not at all what expected. Our mountain view ...,3
1,1,Good location as we needed to head to Reims th...,5
2,3,Me and my son just returned from Broadmoor Mia...,1
3,4,The place was filthy and full of stoned backpa...,1
4,6,The hotel itself is really nice and modern whi...,4


### Saving predictions for test data to an excel file

In [49]:
test_data.to_excel("predictions.xlsx",index=False) 