In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv("small_reviews.csv")

In [3]:
dataset.head()

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


In [4]:
dataset['Label'].value_counts()

5    7139
4    1777
3     495
1     328
2     260
Name: Label, dtype: int64

In [5]:
 dataset.drop('Id',axis=1,inplace=True)

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
dataset['Col1_scaled'] = scaler.fit_transform(dataset['Label'].values.reshape(-1,1))
dataset.head(10)

Unnamed: 0,Review,Label,Col1_scaled
0,good and interesting,5,1.0
1,"This class is very helpful to me. Currently, I...",5,1.0
2,like!Prof and TAs are helpful and the discussi...,5,1.0
3,Easy to follow and includes a lot basic and im...,5,1.0
4,Really nice teacher!I could got the point eazl...,4,0.75
5,"Great course - I recommend it for all, especia...",5,1.0
6,One of the most useful course on IT Management!,5,1.0
7,I was disappointed because the name is mislead...,3,0.5
8,Super content. I'll definitely re-do the course,5,1.0
9,One of the excellent courses at Coursera for i...,5,1.0


In [7]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 9999):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SaintLaurentDon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 100000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values
y=y.astype('int')


In [18]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [19]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Predicting the Test set results
nb_pred = nb.predict(X_test)

In [21]:
# Making the Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix
cm = confusion_matrix(y_test, nb_pred)
print(cm)
print(classification_report(y_test,nb_pred))

[[ 309  258]
 [  85 1348]]
              precision    recall  f1-score   support

           0       0.78      0.54      0.64       567
           1       0.84      0.94      0.89      1433

    accuracy                           0.83      2000
   macro avg       0.81      0.74      0.77      2000
weighted avg       0.82      0.83      0.82      2000



In [22]:
x='This course is very good and very lovely'
print(nb.predict(cv.transform([x]))[0])

1


In [25]:
x='this is an average course. it is sometimes misleading too'
print(nb.predict(cv.transform([x]))[0])

0


In [26]:
#Fitting Random Forest to the training set
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rf.fit(X_train, y_train)

# Predicting the Test set results
rf_pred = rf.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, rf_pred)
print(cm)
print(classification_report(y_test,rf_pred))



[[ 296  271]
 [ 171 1262]]
              precision    recall  f1-score   support

           0       0.63      0.52      0.57       567
           1       0.82      0.88      0.85      1433

    accuracy                           0.78      2000
   macro avg       0.73      0.70      0.71      2000
weighted avg       0.77      0.78      0.77      2000



In [27]:
x="This course is the worst i have taken on coursera. They just want to mention everything in very short time. Quiz is much harder than material on video."
print(rf.predict(cv.transform([x]))[0])

0


In [28]:
x="This course is very poor and also contains lots of errors"
print(rf.predict(cv.transform([x]))[0])

0


In [29]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0,class_weight='balanced')
dt.fit(X_train, y_train)

# Predicting the Test set results
dt_pred = dt.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, dt_pred)
print(cm)
print(classification_report(y_test,rf_pred))

[[ 289  278]
 [ 288 1145]]
              precision    recall  f1-score   support

           0       0.63      0.52      0.57       567
           1       0.82      0.88      0.85      1433

    accuracy                           0.78      2000
   macro avg       0.73      0.70      0.71      2000
weighted avg       0.77      0.78      0.77      2000



In [30]:
x="This course is the worst i have taken on coursera. They just want to mention everything in very short time. Quiz is much harder than material on video."
print(dt.predict(cv.transform([x]))[0])

0


In [31]:
x="This course is very informative and good the teacher is very intelligent"
print(dt.predict(cv.transform([x]))[0])

1


In [32]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver = 'lbfgs', random_state = 0)
lr.fit(X_train, y_train)

# Predicting the Test set results
lr_pred = lr.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, lr_pred)
print(cm)
print(classification_report(y_test,lr_pred))

[[ 285  282]
 [ 114 1319]]
              precision    recall  f1-score   support

           0       0.71      0.50      0.59       567
           1       0.82      0.92      0.87      1433

    accuracy                           0.80      2000
   macro avg       0.77      0.71      0.73      2000
weighted avg       0.79      0.80      0.79      2000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [33]:
x="This course is the worst i have taken on coursera. They just want to mention everything in very short time. Quiz is much harder than material on video."
print(lr.predict(cv.transform([x]))[0])

0


In [34]:
x="This course is very informative and good the teacher is very intelligent"
print(dt.predict(cv.transform([x]))[0])

1
