In [None]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

import string
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the training reviews under the column names used throughout the notebook.
# The file's first line is a header, so it is skipped at read time (skiprows=1)
# instead of being parsed as a data row and sliced off afterwards: a header
# string sitting inside each column stops pandas from inferring a numeric dtype
# for the ratings. Reading it this way also leaves `df` as a fresh frame rather
# than a slice of another one, so adding columns to it later is unambiguous.
df = pd.read_csv('../input/shopee-code-league-20/_DS_Sentiment_Analysis/train.csv',names=['message','rating'],skiprows=1)

In [None]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

# Basic Exploratory Analysis

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# The same summary statistics, computed separately for each rating value.
df.groupby('rating').describe()

In [None]:
# Store the character count of every message in a new 'length' column,
# then preview the frame with the column added.
df['length'] = df['message'].map(len)
df.head()

In [None]:
# Distribution of message lengths, in 50 bins.
df['length'].plot.hist(bins=50)

In [None]:
# Summary statistics (count, mean, min, max, quartiles) of the message lengths.
df['length'].describe()

In [None]:
# Show the longest message in full. The maximum length is looked up from the
# data instead of being typed in by hand, so the cell does not raise an
# IndexError when the dataset (and therefore its maximum length) changes.
# NOTE(review): assumes the previously hard-coded 1249 was the maximum length
# reported by describe() — confirm.
df.loc[df['length'] == df['length'].max(), 'message'].iloc[0]

Seems like a lot of spamming in reviews.

Let's check whether the length of a message has any relationship with its rating.

In [None]:
# One histogram of message length per rating value, to compare the distributions side by side.
df.hist(column='length',by='rating',bins=50,figsize=(10,8))

The graphs imply that there is not much of a relationship between the length of a message and its rating.

# Text Pre-processing

In [None]:
# Deletion table for str.translate: maps every ASCII punctuation character to None.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# English stopwords, loaded from NLTK on first use and reused afterwards.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, fetching the list only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def textprocess(mess):
    """Split a message into words, dropping punctuation and English stopwords.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        The remaining words, in their original order and original casing.
        Casing is folded only for the stopword comparison.
    """
    # Strip punctuation characters before splitting, so "good!!!" becomes "good".
    nonpunc = mess.translate(_PUNCT_TABLE)

    # Fetch the stopword set once, outside the comprehension. Calling
    # stopwords.words() inside it would fetch the whole list again for every
    # single word, and membership in a list is a linear scan on top of that.
    stop_words = _english_stopwords()
    return [word for word in nonpunc.split() if word.lower() not in stop_words]

# Count_Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the bag-of-words vocabulary from all messages, tokenising each one with textprocess.
bow_transformer = CountVectorizer(analyzer=textprocess).fit(df['message'])

In [None]:
# Number of distinct tokens in the learned vocabulary.
print(len(bow_transformer.vocabulary_))

In [None]:
# Turn every message into a vector of token counts over that vocabulary.
messages_bow = bow_transformer.transform(df['message'])

# Tf-idf vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Learn the IDF weights from the count matrix and re-weight that same matrix
# in a single step; the fitted transformer is kept for later reuse.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

# Training a Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of every loaded message.
senti_analysis = MultinomialNB().fit(messages_tfidf,df['rating'])

In [None]:
# Predict ratings for the same messages the model was just fitted on (in-sample predictions).
all_predictions = senti_analysis.predict(messages_tfidf)

# Model Analysis [train data]

In [None]:
# Put the in-sample predictions next to the true ratings for side-by-side inspection.
d = {'Predicted':all_predictions,'Actual':df['rating']}
df_analysis = DataFrame(d)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1, evaluated on the same rows the model was fitted on.
print(classification_report(df['rating'],all_predictions))

# Model Analysis [test accuracy]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data so the model can be scored on messages it was not fitted on.
# random_state pins the shuffle, so the split (and the scores computed from it)
# is the same on every run.
msg_train,msg_test,rating_train,rating_test = train_test_split(df['message'],df['rating'],random_state=42)

# Sizes of all four pieces: the message and rating counts should match pairwise.
print(len(msg_train), len(msg_test), len(rating_train), len(rating_test))

In [None]:
from sklearn.pipeline import Pipeline

# Chain the same three steps used above into one estimator, so that fitting
# and predicting take raw message text directly.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=textprocess)),  # text -> token counts, tokenised by textprocess
    ('tfidf', TfidfTransformer()),                   # token counts -> TF-IDF weights
    ('classifier', MultinomialNB())                  # TF-IDF weights -> predicted rating
])

In [None]:
# Fit every pipeline step on the training portion only.
pipeline.fit(msg_train,rating_train)

In [None]:
# Predict ratings for the held-out messages.
pred_rate = pipeline.predict(msg_test)

In [None]:
# Per-class precision, recall and F1 on the held-out portion.
print(classification_report(rating_test,pred_rate))

# Prediction

In [None]:
# Load the test reviews indexed by review_id, renaming the text column to
# 'message' so it matches the column name used for the training data.
df1 = (
    pd.read_csv('../input/shopee-code-league-20/_DS_Sentiment_Analysis/test.csv',index_col=['review_id'])
    .rename(columns={'review':'message'})
)

In [None]:
# Predict a rating for every test message with the fitted pipeline.
prediction = pipeline.predict(df1['message'])

In [None]:
# Attach the predicted rating to each test review and display the result.
df1['Rating'] = prediction
df1