In [1]:
import numpy as np  
import pandas as pd
import re  
import nltk  
nltk.download('stopwords')  
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saikatb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('OnePlus6T.labelled-1.csv')

In [3]:
data.head()

Unnamed: 0,reviews,intent
0,phone is simply superb in all aspects,product
1,low light performance of the camera is outstan...,product
2,you simply cannot go wrong with this phone,product
3,i got this phone on friday evening,product
4,pros: great battery life amazing performance p...,product


In [4]:
X = data.reviews
y = data.intent.astype('category')

In [5]:
dict( enumerate(y.cat.categories) )

{0: 'delivery', 1: 'product', 2: 'seller'}

In [6]:
y = y.cat.codes

In [7]:
documents = []

for sen in range(0, len(X)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # remove all digits
    document = re.sub(r'\d+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [lemmatizer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer  
vectorizer = CountVectorizer(max_features=100, min_df=3, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(documents).toarray()  

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer  
tfidfconverter = TfidfTransformer()  
X = tfidfconverter.fit_transform(X).toarray()  

In [10]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [12]:
X_res_test, y_res_test = sm.fit_resample(X_test, y_test)

In [13]:
def predict_report(model):
    model.fit(X_res, y_res)
    scores = cross_val_score(model, X_res, y_res, cv=10)
    from sklearn.metrics import accuracy_score
    y_pred = cross_val_predict(model, X_res_test, y_res_test, cv=10) #clf3.predict(X_test)  

    print(accuracy_score(y_res_test, y_pred)) 
    print(metrics.classification_report(y_res_test, y_pred,target_names=['delivery','product','seller']))

In [14]:
def prediction(model, text):
    model.fit(X_res, y_res)
    documents = []

    for sen in range(0, len([text])):  
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str([text][sen]))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # remove all digits
        document = re.sub(r'\d+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()

        document = [lemmatizer.lemmatize(word) for word in document]
        document = ' '.join(document)

        documents.append(document)
    vectorizer = CountVectorizer(max_features=100, stop_words=stopwords.words('english'))  
    X = vectorizer.fit_transform(documents).toarray() 
    tfidfconverter = TfidfTransformer()  
    X = tfidfconverter.fit_transform(X).toarray()
    X_test = np.zeros(100)
    for i,d in enumerate(X[0]):
        X_test[i] = d
    return model.predict([X_test])

# SVC

In [15]:
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.svm import SVC
from sklearn import metrics
clf1 = SVC(C=5.0, gamma='auto', kernel='rbf')
predict_report(clf1)

0.8484848484848485
              precision    recall  f1-score   support

    delivery       0.76      0.86      0.81        44
     product       0.84      0.82      0.83        44
      seller       0.97      0.86      0.92        44

   micro avg       0.85      0.85      0.85       132
   macro avg       0.86      0.85      0.85       132
weighted avg       0.86      0.85      0.85       132



# Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB 
clf2 = MultinomialNB()
predict_report(clf2)

0.8257575757575758
              precision    recall  f1-score   support

    delivery       0.76      1.00      0.86        44
     product       1.00      0.59      0.74        44
      seller       0.81      0.89      0.85        44

   micro avg       0.83      0.83      0.83       132
   macro avg       0.86      0.83      0.82       132
weighted avg       0.86      0.83      0.82       132



# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
clf3 = RandomForestClassifier()
predict_report(clf3)



0.8939393939393939
              precision    recall  f1-score   support

    delivery       0.93      0.93      0.93        44
     product       0.84      0.86      0.85        44
      seller       0.91      0.89      0.90        44

   micro avg       0.89      0.89      0.89       132
   macro avg       0.89      0.89      0.89       132
weighted avg       0.89      0.89      0.89       132



# KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
clf4 = KNeighborsClassifier(n_neighbors=5)
predict_report(clf4) 

0.8257575757575758
              precision    recall  f1-score   support

    delivery       0.76      0.86      0.81        44
     product       0.79      0.70      0.75        44
      seller       0.93      0.91      0.92        44

   micro avg       0.83      0.83      0.83       132
   macro avg       0.83      0.83      0.83       132
weighted avg       0.83      0.83      0.83       132

