In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-mobile-phone-reviews-dataset/SRPtest.csv
/kaggle/input/amazon-mobile-phone-reviews-dataset/SRPtrain.csv
/kaggle/input/starratingprediction-part-2/__results__.html
/kaggle/input/starratingprediction-part-2/__resultx__.html
/kaggle/input/starratingprediction-part-2/__notebook__.ipynb
/kaggle/input/starratingprediction-part-2/__output__.json
/kaggle/input/starratingprediction-part-2/custom.css
/kaggle/input/starratingprediction-part-2/models/model_uni75.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_uni25.sav
/kaggle/input/starratingprediction-part-2/models/model_tri25.sav
/kaggle/input/starratingprediction-part-2/models/model_uni50.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_uni50.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_bi25.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_uni75.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_tri25.sav
/kaggle/input/starratingprediction-part-2

# **Star Rating prediction (Part-3)**

In this notebook, we are going to be making predictions using **Random Forest Algorithm** and comparing the results with Multinomial Naive Bayes done in <a href="https://www.kaggle.com/rajatagg/starratingprediction-part-2">StarRatingPrediction(Part-2)</a>.

Take a look at the previous parts:
* <a href="https://www.kaggle.com/rajatagg/starratingprediction-part-1">Part-1 : Data Preprocessing </a>
* <a href="https://www.kaggle.com/rajatagg/starratingprediction-part-2">Part-2 : Feature Vector Generation & Prediction Using Multinomial Naive Bayes Algorithm with Unigram, Bigram and Trigrams

# Loading the dataset

We will use the preprocessed training data from Part-1 to avoid preprocessing the <a href="https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones">dataset</a> again.

In [3]:
X_train_df = pd.read_csv('/kaggle/input/amazon-mobile-phone-reviews-dataset/SRPtrain.csv')

In [4]:
X_train_df.head()

Unnamed: 0,Reviews,Rating
0,nice practical basic but not cheap very resist...,4
1,grabbed function samsung note took off new hei...,5
2,day love bought neon orange people love color ...,5
3,person trash sent empty box without equipment ...,1
4,wanted good promising but kept turning off res...,1


In [5]:
#  its a python script containing functions used in preprocessing
from preprocessing_text import *

/kaggle/input/amazon-mobile-phone-reviews-dataset/SRPtest.csv
/kaggle/input/amazon-mobile-phone-reviews-dataset/SRPtrain.csv
/kaggle/input/starratingprediction-part-2/__results__.html
/kaggle/input/starratingprediction-part-2/__resultx__.html
/kaggle/input/starratingprediction-part-2/__notebook__.ipynb
/kaggle/input/starratingprediction-part-2/__output__.json
/kaggle/input/starratingprediction-part-2/custom.css
/kaggle/input/starratingprediction-part-2/models/model_uni75.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_uni25.sav
/kaggle/input/starratingprediction-part-2/models/model_tri25.sav
/kaggle/input/starratingprediction-part-2/models/model_uni50.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_uni50.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_bi25.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_uni75.sav
/kaggle/input/starratingprediction-part-2/models/vectorizer_tri25.sav
/kaggle/input/starratingprediction-part-2

In [6]:
X_train_df['review_length'] = np.array(X_train_df['Reviews'].apply(getReviewLength))
X_train_df['word_count'] = np.array(X_train_df['Reviews'].apply(getWordCount))

In [7]:
X_train_df.head()

Unnamed: 0,Reviews,Rating,review_length,word_count
0,nice practical basic but not cheap very resist...,4,66,13
1,grabbed function samsung note took off new hei...,5,144,29
2,day love bought neon orange people love color ...,5,59,12
3,person trash sent empty box without equipment ...,1,47,8
4,wanted good promising but kept turning off res...,1,122,22


# **1.1 Unigrams with review length < 50**

In [8]:
#Using sk-learn implementation for tf-idf vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#dropping the reviews having review.length >= 50
index_names = X_train_df[X_train_df['review_length'] >= 50].index
X_train_df.drop(index_names, inplace = True)

> **fit_transform()** :Learn vocabulary and idf. The input is an iterable which can be string objects and returns Tf-idf-weighted document-term matrix.

In [10]:
# applying fit_transform to generate document term matrix

vectorizer_uni75 = TfidfVectorizer(ngram_range=(1,1))
vectorized_uni = vectorizer_uni.fit_transform(X_train_df['Reviews'])
X_unigram = vectorized_uni.toarray()

> below shown is the matrix in which each row represents a feature vector corresponding to a review.

In [11]:
X_unigram

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

> It is just a sparse representation, all the entries are not zero.

In [12]:
print("The length of the feature vector is :",X_unigram.shape[1])

The length of the feature vector is : 9804


In [13]:
vectorizer_uni.get_feature_names()[100:130]

['activado',
 'activate',
 'activated',
 'activatible',
 'activating',
 'activation',
 'active',
 'actived',
 'activiate',
 'activity',
 'actual',
 'actualization',
 'actually',
 'actuly',
 'acurate',
 'ad',
 'ada',
 'adapt',
 'adaptation',
 'adaptators',
 'adapted',
 'adapter',
 'adaptive',
 'adaptor',
 'add',
 'added',
 'addicted',
 'addiction',
 'addictive',
 'adding']

> Now, that we have generated the feature vector let's train a model on training data using Random Forest Classifier

> **Importing Random Forest Classifier**



In [14]:
from sklearn.ensemble import RandomForestClassifier

> A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. 

**Function Parameters:**
* **n_estimators**: The number of trees in the forest. We will take 5.
* **criterion**: The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

In [15]:
n_estimators = 10

In [16]:
model_rf=RandomForestClassifier(n_estimators =n_estimators,criterion="entropy",random_state =0)

> Training the classifier

In [17]:
model_rf.fit(X_unigram, X_train_df['Rating'])

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

> making predictions

In [18]:
y_pred = model_rf.predict(X_unigram)

> Importing the functions to obtain the performance metrics

In [19]:
from sklearn.metrics import accuracy_score, classification_report

In [20]:
target_names = ['1', '2', '3', '4', '5']
classify_report_1 = classification_report(X_train_df['Rating'], y_pred, target_names=target_names)
print(classify_report_1)
print("Classfication Report: Unigrams & review.length < \n\n")

              precision    recall  f1-score   support

           1       0.96      0.98      0.97     18192
           2       0.97      0.88      0.92      5248
           3       0.96      0.81      0.88      7952
           4       0.96      0.63      0.76     18968
           5       0.92      0.99      0.96     96051

    accuracy                           0.93    146411
   macro avg       0.95      0.86      0.90    146411
weighted avg       0.93      0.93      0.93    146411

Classfication Report: Unigrams & review.length < 




# **2.1 Unigrams with review length < 25**

In [21]:
# review.length < 25 i.e 
index_names = X_train_df[X_train_df['review_length'] > 25].index
X_train_df.drop(index_names, inplace = True)

In [22]:
# applying fit_transform to generate document term matrix

vectorizer_uni3 = TfidfVectorizer(ngram_range=(1,1))
vectorized_uni3 = vectorizer_uni3.fit_transform(X_train_df['Reviews'])
X_unigram3 = vectorized_uni3.toarray()

In [23]:
print("The length of the feature vector is :",X_unigram3.shape[1])

The length of the feature vector is : 5056


In [24]:
model_rf2=RandomForestClassifier(n_estimators =n_estimators,criterion="entropy",random_state =0)

In [25]:
model_rf2.fit(X_unigram3, X_train_df['Rating'])

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [26]:
y_pred = model_rf2.predict(X_unigram3)

In [27]:
target_names = ['1', '2', '3', '4', '5']
classify_report_2 = classification_report(X_train_df['Rating'], y_pred, target_names=target_names)
print(classify_report_2)
print("Classfication Report: Unigrams & review.length < 25\n\n")

              precision    recall  f1-score   support

           1       0.94      0.97      0.95      9211
           2       0.92      0.77      0.84      2594
           3       0.92      0.65      0.76      4270
           4       0.91      0.43      0.59     12275
           5       0.89      0.99      0.94     70450

    accuracy                           0.90     98800
   macro avg       0.92      0.76      0.82     98800
weighted avg       0.90      0.90      0.89     98800

Classfication Report: Unigrams & review.length < 25




# **Performance of Random Forest Classifier(Training Data)**

   <table>
  <tr>
    <th>(Ngram, Review Length(less than))</th>
    <th>Accuracy</th>
    <th>Precision</th>
    <th>Recall</th>
    <th>F1-Score</th>
  </tr>
  <tr>
    <td>Unigram, 50</td>   
      <td>0.93</td>
    <td>0.93</td>
    <td>0.93</td>
      <td>0.93</td>
  </tr>
  <tr>
    <td>Unigram, 25</td>
    <td>0.90</td>
    <td>0.90</td>
    <td>0.90</td>
    <td>0.89</td>
  </tr>
</table>

> * For greater review lengths, the classifier was consuming a lot of memory and time.
> * Bigram, trigram even for smaller lengths was resulting in large no. of features resulting in the same.

# **Let's test the models on the test Data**

In [28]:
X_test = pd.read_csv('/kaggle/input/amazon-mobile-phone-reviews-dataset/SRPtest.csv')

In [29]:
X_test.head()

Unnamed: 0,Reviews,Rating
0,I've been a long time fan of lg's phones and l...,5
1,"Great Phone for the price, very quickly. The c...",5
2,Bye bye iPhone. After much research and many q...,5
3,Allí was ver y fine,5
4,"The ringer volume is poor, It needs beefing up",4


> Just to save time from pre-processing again and again the test data, I have done it once here

In [30]:
X_test['Reviews'] = X_test['Reviews'].apply(preprocess_review)

In [31]:
# function to get the star rating for a review

def getStarRating(vectorizer, model, review):
    # preprocessed_review = preprocess_review(review)
    features = vectorizer.transform([review]) # if already preprocessed
    feature_vector = features.toarray()
    predicted_class = model.predict(feature_vector)
    return predicted_class

In [32]:
review = "not a much great product"
getStarRating(vectorizer_uni3, model_rf2,review)

array([5])

# **For 1.1 (Unigram,50)**

> generating the feature vector using the vectorizer initialized already

In [33]:
vectorized_uni = vectorizer_uni.transform(X_test['Reviews'])
X_test_U50 = vectorized_uni.toarray()

In [34]:
y_pred_U50 = model_rf.predict(X_test_U50)

In [35]:
target_names = ['1', '2', '3', '4', '5']
classify_report_3 = classification_report(X_test['Rating'], y_pred_U50, target_names=target_names)
print(classify_report_3)
print("Classfication Report: Unigrams & review.length < 50\n\n")

              precision    recall  f1-score   support

           1       0.50      0.87      0.64     21701
           2       0.51      0.21      0.29      7417
           3       0.47      0.27      0.34      9529
           4       0.56      0.28      0.37     18412
           5       0.82      0.85      0.83     67075

    accuracy                           0.68    124134
   macro avg       0.57      0.49      0.50    124134
weighted avg       0.68      0.68      0.66    124134

Classfication Report: Unigrams & review.length < 50




# **For 1.2 (Unigram,25)**

In [36]:
vectorized_uni3 = vectorizer_uni3.transform(X_test['Reviews'])
X_test_U25 = vectorized_uni3.toarray()

In [37]:
y_pred_U25 = model_rf2.predict(X_test_U25)

In [38]:
target_names = ['1', '2', '3', '4', '5']
classify_report_4 = classification_report(X_test['Rating'], y_pred_U25, target_names=target_names)
print(classify_report_4)
print("Classfication Report: Unigrams & review.length < 25\n\n")

              precision    recall  f1-score   support

           1       0.41      0.86      0.56     21701
           2       0.30      0.12      0.17      7417
           3       0.32      0.16      0.21      9529
           4       0.42      0.17      0.24     18412
           5       0.82      0.78      0.80     67075

    accuracy                           0.62    124134
   macro avg       0.46      0.42      0.40    124134
weighted avg       0.62      0.62      0.59    124134

Classfication Report: Unigrams & review.length < 25




# **Performance of Random Forest Classifier(Test Data)**

   <table>
  <tr>
    <th>(Ngram, Review Length(less than))</th>
    <th>Accuracy</th>
    <th>Precision</th>
    <th>Recall</th>
    <th>F1-Score</th>
  </tr>
  <tr>
    <td>Unigram, 50</td>   
      <td>0.69</td>
    <td>0.69</td>
    <td>0.69</td>
      <td>0.66</td>
  </tr>
  <tr>
    <td>Unigram, 25</td>
    <td>0.62</td>
    <td>0.63</td>
    <td>0.62</td>
    <td>0.60</td>
  </tr>
</table>

# **Performance of Random Forest Classifier(Training Data)**

   <table>
  <tr>
    <th>(Ngram, Review Length(less than))</th>
    <th>Accuracy</th>
    <th>Precision</th>
    <th>Recall</th>
    <th>F1-Score</th>
  </tr>
  <tr>
    <td>Unigram, 50</td>   
      <td>0.93</td>
    <td>0.93</td>
    <td>0.93</td>
      <td>0.93</td>
  </tr>
  <tr>
    <td>Unigram, 25</td>
    <td>0.90</td>
    <td>0.90</td>
    <td>0.90</td>
    <td>0.89</td>
  </tr>
</table>

> * For greater review lengths, the classifier was consuming a lot of memory and time.
> * Bigram, trigram even for smaller lengths was resulting in large no. of features resulting in the same.

# **We will be doing the analysis of Multinomial Naive Bayes Classifier in another notebook.**