In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the review dataset from Final.csv (path is relative to the working directory)
# and display it; the output shows a Rating and a Reviews column.
df = pd.read_csv("Final.csv")
df

Unnamed: 0,Rating,Reviews
0,5.0,"Very Good sound clarity with bass, build quali..."
1,4.0,Built quality is very good. Bass is a bit too ...
2,5.0,They are just awesome . no less then JBL. voca...
3,5.0,nice product with beautiful design and fast de...
4,4.0,Got it by the next day of order! Excellent ser...
...,...,...
87397,5.0,Five Stars
87398,5.0,Value for Money
87399,3.0,Poor
87400,5.0,It does not go above 100Mbps


In [None]:
# Add a 'length' column holding the character count of each raw review.
# NOTE(review): the values display as floats (e.g. 142.0), which suggests that at
# least one review is missing (NaN) — confirm with df['Reviews'].isna().sum().
df['length'] = df['Reviews'].str.len()
df.head(4)

Unnamed: 0,Rating,Reviews,length
0,5.0,"Very Good sound clarity with bass, build quali...",142.0
1,4.0,Built quality is very good. Bass is a bit too ...,222.0
2,5.0,They are just awesome . no less then JBL. voca...,171.0
3,5.0,nice product with beautiful design and fast de...,86.0


In [None]:
# Check the dtype of the Reviews column before it is normalised to strings.
df.Reviews.dtypes


dtype('O')

In [None]:
# Normalise every review to a plain string. Missing values are replaced with ''
# first; otherwise str(NaN) would inject the literal word "nan" into the corpus
# and the models would learn it as if it were review text.
df.Reviews = df.Reviews.fillna('').apply(str)

In [None]:
pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-i0lpgl_s
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-i0lpgl_s
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-cp37-none-any.whl size=11759 sha256=4c667c939c42541f92d19a424f40825ff2b87c6cc506f6b0a58b116785cbbe96
  Stored in directory: /tmp/pip-ephem-wheel-cache-u_zzopp0/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
  Found existing installation: preprocess-kgptalkie 0.1.3
    Uninstalling preprocess-kgptalkie-0.1.3:
      Successfully uninstalled preprocess-kgptalkie-0.1.3
Successfully installed prep

In [None]:
import preprocess_kgptalkie as ps
import re

In [None]:
def clean(x):
    """Normalise one review into lower-case, markup-free text.

    The value is coerced to ``str``, lower-cased, stripped of backslashes and
    has its underscores turned into spaces. It is then passed, in order,
    through the ``preprocess_kgptalkie`` helpers ``cont_exp``,
    ``remove_emails``, ``remove_urls``, ``remove_html_tags``,
    ``remove_accented_chars`` and ``remove_special_chars``. Finally, any
    character repeated three or more times in a row is collapsed to a single
    occurrence.

    Parameters
    ----------
    x : object
        Raw review; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned review text.
    """
    text = str(x).lower()
    text = text.replace('\\', '')
    text = text.replace('_', ' ')

    # The order matters: each helper receives the output of the previous one.
    pipeline = (
        ps.cont_exp,
        ps.remove_emails,
        ps.remove_urls,
        ps.remove_html_tags,
        ps.remove_accented_chars,
        ps.remove_special_chars,
    )
    for step in pipeline:
        text = step(text)

    # Collapse runs of 3+ identical characters down to one.
    return re.sub("(.)\\1{2,}", "\\1", text)

In [None]:
# Run every review through clean(); the function is passed directly to apply().
df['Reviews'] = df['Reviews'].apply(clean)
df.head()

Unnamed: 0,Rating,Reviews,length
0,5.0,very good sound clarity with bass build qualit...,142.0
1,4.0,built quality is very good bass is a bit too h...,222.0
2,5.0,they are just awesome no less then jbl vocals ...,171.0
3,5.0,nice product with beautiful design and fast de...,86.0
4,4.0,got it by the next day of order excellent serv...,468.0


In [None]:
# Download the NLTK resources used by the stop-word removal step:
# the stop-word corpus and the 'punkt' tokenizer data.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords  #For stopwords
from nltk.tokenize import word_tokenize

# Extra informal tokens dropped in addition to NLTK's English stop words.
_EXTRA_STOP_WORDS = ('u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure')
_STOP_WORDS = None  # built lazily on first use, then reused for every review


def _get_stop_words():
    """Return the combined stop-word set, building it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english')).union(_EXTRA_STOP_WORDS)
    return _STOP_WORDS


def remove_stopwords(text):
    """Tokenize ``text`` and drop stop words.

    Parameters
    ----------
    text : str
        Cleaned review text.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` that are neither NLTK English
        stop words nor one of the extra tokens in ``_EXTRA_STOP_WORDS``.
    """
    # The set is cached: rebuilding it for each review is loop-invariant work
    # repeated once per row of the DataFrame.
    stop_words = _get_stop_words()
    return [w for w in word_tokenize(text) if w not in stop_words]

# Replace each cleaned review string with its list of non-stop-word tokens.
df.Reviews = df.Reviews.apply(remove_stopwords)
df.Reviews.head()

0    [good, sound, clarity, bass, build, quality, g...
1    [built, quality, good, bass, bit, high, listen...
2    [awesome, less, jbl, vocals, balanceclear, sou...
3    [nice, product, beautiful, design, fast, deliv...
4    [got, next, day, order, excellent, service, fl...
Name: Reviews, dtype: object

In [None]:
# Add a column with the cleaned length after punctuation and stop-word removal.
# NOTE: Reviews holds token lists at this point, so .str.len() counts tokens per
# review, whereas the earlier 'length' column counts characters of the raw text.
df['length of cleaned data'] = df['Reviews'].str.len()
df.head()

Unnamed: 0,Rating,Reviews,length,length of cleaned data
0,5.0,"[good, sound, clarity, bass, build, quality, g...",142.0,16
1,4.0,"[built, quality, good, bass, bit, high, listen...",222.0,23
2,5.0,"[awesome, less, jbl, vocals, balanceclear, sou...",171.0,15
3,5.0,"[nice, product, beautiful, design, fast, deliv...",86.0,8
4,4.0,"[got, next, day, order, excellent, service, fl...",468.0,48


In [None]:
# Stemming using Snowball
from nltk.stem import SnowballStemmer

_SNOWBALL = None  # shared English stemmer, created on first use


def stem_text(text):
    """Stem every token and join the stems into one space-separated string.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.

    Returns
    -------
    str
        The Snowball (English) stems joined by single spaces; an empty
        string when ``text`` is empty.
    """
    global _SNOWBALL
    # One stemmer is reused across calls instead of constructing a new
    # SnowballStemmer for every review.
    if _SNOWBALL is None:
        _SNOWBALL = SnowballStemmer('english')
    return " ".join(_SNOWBALL.stem(w) for w in text)

# Turn each token list back into a single string of stemmed words.
df.Reviews = df.Reviews.apply(stem_text)
df.Reviews.head()

0    good sound clariti bass build qualiti good wir...
1    built qualiti good bass bit high listen longer...
2    awesom less jbl vocal balanceclear sound even ...
3    nice product beauti design fast deliveri thank...
4    got next day order excel servic flipkart earph...
Name: Reviews, dtype: object

In [None]:
# Persist the cleaned frame to Cleaned_data.csv without writing the index.
df.to_csv("Cleaned_data.csv", index = False)

# Feature Extraction

In [None]:
# Import the vectorizer and convert the cleaned reviews into TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer
# Character-level TF-IDF: n-grams of 1 to 5 characters, capped at 25,000 features
tf_vec = TfidfVectorizer(max_features=25000, ngram_range=(1,5), analyzer='char')

# Separating into input and output variables
# NOTE(review): this fit uses every review, including the ones later held out
# for testing, so the vocabulary and IDF weights of `x` are learned from test data too.
x = tf_vec.fit_transform(df['Reviews'])
y = df['Rating']

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
if hasattr(tf_vec, 'get_feature_names_out'):
    feature_names = tf_vec.get_feature_names_out()
else:
    feature_names = tf_vec.get_feature_names()
list(feature_names[:20])

[' ',
 ' 0',
 ' 0 ',
 ' 0 b',
 ' 0 ba',
 ' 0 s',
 ' 0 st',
 ' 1',
 ' 1 ',
 ' 1 c',
 ' 1 ca',
 ' 1 d',
 ' 1 da',
 ' 1 l',
 ' 1 la',
 ' 1 m',
 ' 1 mo',
 ' 1 s',
 ' 1 sp',
 ' 1 y']

In [None]:
# Show the full parameter set of the vectorizer, including defaults.
tf_vec.get_params()

{'analyzer': 'char',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 25000,
 'min_df': 1,
 'ngram_range': (1, 5),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

# Training Classifiers

In [None]:
# Split the cleaned reviews and their ratings into train and test partitions
# BEFORE learning TF-IDF statistics. Fitting the vectorizer on the whole corpus
# leaks test data into the features, so it is (re)fitted on the training reviews
# only and the test reviews are merely transformed.
# The same test_size/random_state on the same number of rows keeps the row
# partition identical to splitting the pre-computed matrix.
from sklearn.model_selection import train_test_split
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df['Reviews'], y, test_size=0.2, random_state=42)
x_train = tf_vec.fit_transform(reviews_train)
x_test = tf_vec.transform(reviews_test)

In [None]:
# Importing all the model library
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
# Importing performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Instantiate the candidate classifiers with the chosen hyper-parameters.
# The stochastic estimators get a fixed random_state (matching the seed used for
# the train/test split) so that a fresh Restart & Run All reproduces the scores.
svc = LinearSVC(C=20, class_weight='balanced', random_state=42)
mnb = MultinomialNB(alpha = 1.0,fit_prior = True)
pac = PassiveAggressiveClassifier(C = 0.8, n_iter_no_change = 4, random_state=42)
rfc = RandomForestClassifier(n_estimators = 100,min_samples_split = 2, random_state=42)

In [None]:
# Fit a model and report its performance metrics
def evaluate_metrics(model):
    """Fit ``model`` on the training split and print its evaluation report.

    Uses the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. Prints, in order: the test accuracy, the training accuracy,
    the confusion matrix and the classification report for the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in
        place.

    Returns
    -------
    None
    """
    model.fit(x_train,y_train)
    # The training score was previously computed and thrown away; keep it and
    # report it so a large train/test gap (over-fitting) is visible.
    train_accuracy = model.score(x_train,y_train)
    pred = model.predict(x_test)
    print('Accuracy score of',model,'is:')
    print(accuracy_score(y_test,pred))
    print('Training accuracy:', train_accuracy)
    print(confusion_matrix(y_test,pred))
    print(classification_report(y_test,pred))

In [None]:
# Train and evaluate the LinearSVC model.
evaluate_metrics(svc)

Accuracy score of LinearSVC(C=20, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) is:
0.9877009324409358
[[    2     3    10     1     0]
 [    1     7     6     0     1]
 [    7    12   142    36     6]
 [    1    18    54  5544     5]
 [    0    19    29     6 11571]]
              precision    recall  f1-score   support

         1.0       0.18      0.12      0.15        16
         2.0       0.12      0.47      0.19        15
         3.0       0.59      0.70      0.64       203
         4.0       0.99      0.99      0.99      5622
         5.0       1.00      1.00      1.00     11625

    accuracy                           0.99     17481
   macro avg       0.58      0.65      0.59     17481
weighted avg       0.99      0.99      0.99     17481



In [None]:
# Train and evaluate the MultinomialNB model.
evaluate_metrics(mnb)

Accuracy score of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) is:
0.9835821749327842
[[    0     0    12     0     4]
 [    0     0     8     0     7]
 [    0     0   143    27    33]
 [    0     0   128  5454    40]
 [    0     0    23     5 11597]]
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        16
         2.0       0.00      0.00      0.00        15
         3.0       0.46      0.70      0.55       203
         4.0       0.99      0.97      0.98      5622
         5.0       0.99      1.00      1.00     11625

    accuracy                           0.98     17481
   macro avg       0.49      0.53      0.51     17481
weighted avg       0.99      0.98      0.98     17481



In [None]:
# Train and evaluate the PassiveAggressiveClassifier model.
evaluate_metrics(pac)

Accuracy score of PassiveAggressiveClassifier(C=0.8, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=4,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False) is:
0.9873004976831989
[[    1     0    13     2     0]
 [    0     1    11     2     1]
 [    3     8   122    60    10]
 [    0     0    55  5563     4]
 [    0     0    42    11 11572]]
              precision    recall  f1-score   support

         1.0       0.25      0.06      0.10        16
         2.0       0.11      0.07      0.08        15
         3.0       0.50      0.60      0.55       203
         4.0       0.99      0.99      0.99      5622
         5.0       1.00      1.00      1.00     11625

    accuracy                           0.99     17481
  

In [None]:
# Train and evaluate the RandomForestClassifier model.
evaluate_metrics(rfc)

Accuracy score of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False) is:
0.9883301870602368
[[    1     0    10     3     2]
 [    0     1     4     4     6]
 [    2     6   117    70     8]
 [    0     0    44  5568    10]
 [    0     0    21    14 11590]]
              precision    recall  f1-score   support

         1.0       0.33      0.06      0.11        16
         2.0       0.14      0.07      0.09        15
         3.0       0.60      0.58      0.59       203
         4.0       0.98      

###### Observations: RandomForestClassifier performs well, so we select it as the final model.

## Testing Predictions

In [None]:
# Score one unseen review.
# - The text goes through the same clean -> stop-word removal -> stemming steps
#   that were applied to the training reviews, so the features are comparable.
# - Fresh variable names are used: rebinding `tf_vec` to the transformed matrix
#   destroys the fitted vectorizer (the cell fails when run a second time), and
#   rebinding `x` discards the feature matrix.
sample_review = 'Built quality is very good. Bass is a bit too '
sample_text = stem_text(remove_stopwords(clean(sample_review)))
sample_features = tf_vec.transform([sample_text])
rfc.predict(sample_features)

array([4.])