In [1]:
# Import relevant libraries and packages
import pickle
import pandas as pd
import numpy as np
import seaborn as sns

import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import re
import emoji #install pip install emoji --upgrade

import spacy
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import enchant
from enchant.checker import SpellChecker

# Phase 0 => Data setup

Creation of new dataset to run classification model with only 2 dimensions : 'review content' and 'rating'.

In [None]:
df_test1 = df[['review_content','rating']]

### A) Label creation

Since the dataset is inbalanced, 2 scenarii need to be designed depending on the success of ML prediction :
- 1 scenario with 5 categories corresponding to the 5 different scores (from 1 to 5)
- 1 scenario with 3 macro-categories gathering scores 

##### Scenario 1 => 5 categories

In [None]:
# function creating labels accroding to rating
def label_rating (row):
    if row['rating'] == 1 :
        return '1 - awful'
    if row['rating'] == 2 :
        return '2 - bad'
    if row['rating'] == 3 :
        return '3 - neutral'
    if row['rating'] == 4 :
        return '4 - good'
    if row['rating'] == 5 :
        return '5 - awesome'
    return 'Other'

In [None]:
# application of function to create labels
df_test1['rating_label'] = df_test1.apply (lambda row: label_rating(row), axis=1)
df_test1.head()

In [None]:
# creation of dictionary scenario 1
rating1_df = df_test1[['rating_label', 'rating']].drop_duplicates().sort_values('rating')
ratinglabel1_to_id = dict(rating1_df.values)
id_to_ratinglabel1 = dict(rating1_df[['rating', 'rating_label']].values)

In [None]:
ratinglabel1_to_id 

In [None]:
# distribution chart
fig = plt.figure(figsize=(8,6))
df_test1.groupby('rating_label').review_content.count().plot.bar(ylim=0)
plt.show()

##### Scenario 2 => 3 categories

- Bad = categories 1 & 2
- Neutral = category 3
- Good = categories 4 & 5

In [None]:
df_test2 = df[['review_content','rating']]

def label_rating2 (row):
    if row['rating'] == 1 :
        return 'bad'
    if row['rating'] == 2 :
        return 'bad'
    if row['rating'] == 3 :
        return 'neutral'
    if row['rating'] == 4 :
        return 'good'
    if row['rating'] == 5 :
        return 'good'
    return 'Other'

df_test2['rating_label'] = df_test2.apply (lambda row: label_rating2(row), axis=1)
df_test2.head()

In [None]:
# creation of dictionary scenario 2
rating2_df = df_test2[['rating_label', 'rating']].drop_duplicates().sort_values('rating')
ratinglabel2_to_id = dict(rating2_df.values)
id_to_ratinglabel2 = dict(rating2_df[['rating', 'rating_label']].values)

In [None]:
# distribution chart
fig = plt.figure(figsize=(8,6))
df_test2.groupby('rating_label').review_content.count().plot.bar(ylim=0)
plt.show()

# Phase 1 => Preparatory work

### A) Feature engineering

#### Step 1 => Features created directly from text

1. Number of Characters

In [97]:
def count_chars(text):
    return len(text)

2. Number of words

In [98]:
def count_words(text):
    return len(text.split())

3. Number of capital characters

In [130]:
def count_capital_chars(text):
    count=0
    for i in text:
        if i.isupper()== True: #isupper() method returns: True if characters in a string are uppercase characters
            count+=1
    return count

4. Number of capital words

In [152]:
def count_capital_words(text):
    return sum(map(str.isupper, text.split()))

5. Count the number of punctuations

In [104]:
def count_punctuations(text):
    punctuations="!#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    d=dict()
    for i in punctuations:
        d[str(i)+'count']=text.count(i)
    return d 

6. Number of words in quotes

In [105]:
def count_words_in_quotes(text):
    x = re.findall("'.'|'.'", text)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count

7. Number of sentences

In [106]:
def count_sent(text):
    return len(nltk.sent_tokenize(text))

8. Count the number of unique words

In [107]:
def count_unique_words(text):
    return len(set(text.split()))

9. Count of hashtags

In [108]:
def count_htags(text):
    x = re.findall(r'(#w[A-Za-z0-9]*)', text)
    return len(x) 

10. Count of mentions

In [109]:
def count_mentions(text):
    x = re.findall(r'(@w[A-Za-z0-9]*)', text)
    return len(x)

11. Count of stopwords

In [112]:
nlp = spacy.load("en_core_web_sm") # need to use the medium model (not small one)
nlp.Defaults.stop_words -= {"no", "not", "isn't","can't", "cannot", "doesn't", "don't", "but", "won't", "shouldn't"} #to remove words form Spacy stopwords

stopwords = nlp.Defaults.stop_words

print(len(stopwords))

322


In [113]:
print(stopwords)

{'everywhere', 'here', 'off', 'back', 'seeming', 'over', 'first', 'other', 'onto', 'hundred', 'something', 'him', 'via', 'get', '’s', 'latter', 'seems', 'never', 'any', 'around', 'hereafter', 'somewhere', 'they', 'afterwards', 'often', 'otherwise', 'further', 'unless', 'we', 'call', 'until', '‘ve', 'he', 'really', 'thence', 'because', 'six', 'herself', 'ourselves', 'two', 'becomes', 'whereas', 'who', 'fifty', 'does', 'done', 'forty', 'it', "'ve", 'mine', 'up', 'third', 'again', 'am', 'least', 'together', 'anything', 'all', 'nine', 'be', 'and', 'into', 'give', 'could', 'else', 'between', '’re', 'towards', 'each', 'many', 'yourself', 'empty', 'side', 'latterly', 'bottom', 'front', 'must', 'hence', "'re", 'always', 'have', 'hereupon', 'part', 'make', 'just', 'us', 'about', 'beside', 'sixty', 'will', 'becoming', 'made', 'nevertheless', 'yet', "'m", 'anyway', 'after', 'due', 'became', 'before', 'much', '‘ll', 'yours', 'move', 'our', 'anyone', 'whoever', 'elsewhere', 'thru', 'been', 'them', 

In [120]:
def count_stopwords(text):
    stop_words = stopwords  
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

11. Count of emojis

In [None]:
def count_emoji(text):
    return emoji.emoji_count(text)

##### Application of 1st round of feature engineering

In [157]:
#apply all the functions just created above, to the whole dataset
df['char_count'] = df["text"].apply(lambda x:count_chars(x))
df['word_count'] = df["text"].apply(lambda x:count_words(x))
df['sent_count'] = df["text"].apply(lambda x:count_sent(x))
df['capital_char_count'] = df["text"].apply(lambda x:count_capital_chars(x))
df['capital_word_count'] = df["text"].apply(lambda x:count_capital_words(x))
df['quoted_word_count'] = df["text"].apply(lambda x:count_words_in_quotes(x))
df['stopword_count'] = df["text"].apply(lambda x:count_stopwords(x))
df['unique_word_count'] = df["text"].apply(lambda x:count_unique_words(x))
df['htag_count'] = df["text"].apply(lambda x:count_htags(x))
df['mention_count'] = df["text"].apply(lambda x:count_mentions(x))
df['punct_count'] = df["text"].apply(lambda x:count_punctuations(x))
df['emoji_count'] = df["text"].apply(lambda x:count_emoji(x))

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12338 entries, 0 to 12337
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   text                12338 non-null  object        
 1   rating              12338 non-null  int64         
 2   date                12338 non-null  datetime64[ns]
 3   char_count          12338 non-null  int64         
 4   word_count          12338 non-null  int64         
 5   sent_count          12338 non-null  int64         
 6   capital_char_count  12338 non-null  int64         
 7   capital_word_count  12338 non-null  int64         
 8   quoted_word_count   12338 non-null  int64         
 9   stopword_count      12338 non-null  int64         
 10  unique_word_count   12338 non-null  int64         
 11  htag_count          12338 non-null  int64         
 12  mention_count       12338 non-null  int64         
 13  punct_count         12338 non-null  object    

#### Step 2 => New features created directly from features above and applied on df

12. Calculating average word length

In [159]:
df['avg_wordlength'] = df['char_count']/df['word_count']

13. Calculating average sentence length

In [160]:
df['avg_sentlength'] = df['word_count']/df['sent_count']

14. Ratio unique words vs word count

In [161]:
df['unique_vs_words'] = df['unique_word_count']/df['word_count']

15. Ratio stopwords count vs words count

In [162]:
df['stopwords_vs_words'] = df['stopword_count']/df['word_count']

In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12338 entries, 0 to 12337
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   text                12338 non-null  object        
 1   rating              12338 non-null  int64         
 2   date                12338 non-null  datetime64[ns]
 3   char_count          12338 non-null  int64         
 4   word_count          12338 non-null  int64         
 5   sent_count          12338 non-null  int64         
 6   capital_char_count  12338 non-null  int64         
 7   capital_word_count  12338 non-null  int64         
 8   quoted_word_count   12338 non-null  int64         
 9   stopword_count      12338 non-null  int64         
 10  unique_word_count   12338 non-null  int64         
 11  htag_count          12338 non-null  int64         
 12  mention_count       12338 non-null  int64         
 13  punct_count         12338 non-null  object    

16. Adding columns based on punctuation used in each text

In [165]:
# Creating new dataframe retrieving all the different punctuation used in each text
df_punct = pd.DataFrame(list(df.punct_count))

# Merging punctuation DataFrame with main DataFrame
df = pd.merge(df, df_punct, left_index=True, right_index=True)

# Dropping "punct_count" column from main DataFrame
df.drop(columns=['punct_count'],inplace=True)
df.columns

Index(['text', 'rating', 'date', 'char_count', 'word_count', 'sent_count',
       'capital_char_count', 'capital_word_count', 'quoted_word_count',
       'stopword_count', 'unique_word_count', 'htag_count', 'mention_count',
       'avg_wordlength', 'avg_sentlength', 'unique_vs_words',
       'stopwords_vs_words', '!count', '#count', '$count', '%count', '&count',
       ''count', '(count', ')count', '*count', '+count', ',count', '-count',
       '.count', '/count', ':count', ';count', '<count', '=count', '>count',
       '?count', '@count', '[count', '\count', ']count', '^count', '_count',
       '`count', '{count', '|count', '}count', '~count'],
      dtype='object')

In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12338 entries, 0 to 12337
Data columns (total 48 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   text                12338 non-null  object        
 1   rating              12338 non-null  int64         
 2   date                12338 non-null  datetime64[ns]
 3   char_count          12338 non-null  int64         
 4   word_count          12338 non-null  int64         
 5   sent_count          12338 non-null  int64         
 6   capital_char_count  12338 non-null  int64         
 7   capital_word_count  12338 non-null  int64         
 8   quoted_word_count   12338 non-null  int64         
 9   stopword_count      12338 non-null  int64         
 10  unique_word_count   12338 non-null  int64         
 11  htag_count          12338 non-null  int64         
 12  mention_count       12338 non-null  int64         
 13  avg_wordlength      12338 non-null  float64   

In [167]:
df.describe()

Unnamed: 0,rating,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,...,[count,\count,]count,^count,_count,`count,{count,|count,}count,~count
count,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,...,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0,12338.0
mean,4.528692,92.415059,17.153509,1.791863,2.129438,0.382153,0.000162,7.404279,15.358972,0.0,...,0.0,8.1e-05,0.0,0.000648,0.000486,8.1e-05,0.0,0.0,0.0,0.000162
std,1.118871,103.510151,19.484878,1.297253,3.491026,1.053992,0.012731,10.289477,15.667021,0.0,...,0.0,0.009003,0.0,0.036007,0.028466,0.009003,0.0,0.0,0.0,0.012731
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,26.0,5.0,1.0,1.0,0.0,0.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,56.0,10.0,1.0,1.0,0.0,0.0,4.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,115.0,21.0,2.0,2.0,0.0,0.0,9.0,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,1140.0,214.0,20.0,110.0,29.0,1.0,105.0,152.0,0.0,...,0.0,1.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,1.0


### B) Preprocessing methodology

In [None]:
# transform emojis into words
def emo_trans(text):
    text=emoji.demojize(text)
    text=text.replace(":"," ")
    text=' '.join(text.split())
    return text

In [None]:
# correct mispelling words
class Solution:
    def solve(self, s):
        seen = s[0]
        ans = s[0]
        for i in s[1:]:
            if i != seen:
                ans += i
                seen = i
        return ans

In [None]:
ob = Solution()

In [326]:
def preprocessing(text):
    text=text.lower() # to put in lower case
    text=' '.join(text.split()) # to remove extra white spaces (whichever how many)
    text=re.sub("'", "", text) # to avoid removing contractions in english
    text=emo_trans(text) # to transform emojis into words
    text=re.sub("@[A-Za-z0-9_]+","", text) # to remove mentions
    text=re.sub("#[A-Za-z0-9_]+","", text) # to remove hashtags
    text=re.sub(r"http\S+", "", text) # to remove urls
    text=re.sub(r"www.\S+", "", text) # to remove urls
    text=re.sub('((www.[^s]+)|(https?://[^s]+))',' ',text) # to remove urls - 3rd version
    text=re.sub("[^a-z0-9]"," ", text) # to remove non-alphanumerical characters
    text=ob.solve(text)
    text=correcter_words(text)
    tokens = word_tokenize(text) # to tokenize
    tokens_no_punctuation = [t for t in tokens if t.isalpha()]
    tokens_no_stop = [t for t in tokens_no_punctuation if t not in stopwords]
    lemmatizer = WordNetLemmatizer()
    token_lem = [lemmatizer.lemmatize(t) for t in tokens_no_stop]
    return token_lem

[nltk_data] Downloading package punkt to /home/vb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### C) Separating reviews per rating level

In [171]:
# df with 1-star reviews
df_1star = df.loc[df['rating']==1]
df_1star.shape

(834, 48)

In [172]:
# df with 2-star reviews
df_2star = df.loc[df['rating']==2]
df_2star.shape

(267, 48)

In [173]:
# df with 3-star reviews
df_3star = df.loc[df['rating']==3]
df_3star.shape

(380, 48)

In [174]:
# df with 4-star reviews
df_4star = df.loc[df['rating']==4]
df_4star.shape

(918, 48)

In [175]:
# df with 5-star reviews
df_5star = df.loc[df['rating']==5]
df_5star.shape

(9939, 48)

### D) Preprocessing per rating level

### E) TF-IDF

extracting features from text is to use the bag of words model: a model where for each document, a complaint narrative in our case, the presence (and often the frequency) of words is taken into consideration, but the order in which they occur is ignored.

Specifically, for each term in our dataset, we will calculate a measure called Term Frequency, Inverse Document Frequency, abbreviated to tf-idf. We will use sklearn.feature_extraction.text.TfidfVectorizer to calculate a tf-idf vector for each of consumer complaint narratives:

In [None]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 4), stop_words='english')
features = tfidf.fit_transform(df_test1['review_content']).toarray()
labels = df_test1['rating']
features.shape

Now, each of 22968 consumer reviews narratives is represented by 8728 features, representing the tf-idf score for different unigrams and bigrams.

### F) N-grams

We can use sklearn.feature_selection.chi2 to find the terms that are the most correlated with each of the rating_label:

In [None]:
N = 5 #top x associated words to be displayed
for rating_label, rating in sorted(ratinglabel_to_id.items()):
    features_chi2 = chi2(features, labels == rating)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names_out())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
    quadrigrams = [v for v in feature_names if len(v.split(' ')) == 4]
    print("# '{}' star :".format(rating))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
    print("  . Most correlated trigrams:\n. {}".format('\n. '.join(trigrams[-N:])))
    print("  . Most correlated quadrigrams:\n. {}".format('\n. '.join(quadrigrams[-N:])))

### G) Concatenation

# Phase 2 => Topic Classification

### Step 1 - Dataset augmentation

### Step 2 - Dataset splitting between train/test/validation

In [39]:
# rating distribution 
df['rating'].value_counts()/df.shape[0]*100

5    80.556006
4     7.440428
1     6.759604
3     3.079916
2     2.164046
Name: rating, dtype: float64

In [40]:
X = df.drop("rating", axis = 1)
y = df['rating']

In [43]:
# set aside 20% of train and test data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify= y, test_size=0.2, shuffle = True, 
                                                    random_state = 8)

# Use the same function above for the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify= y_train, test_size=0.25, random_state= 8) # 0.25 x 0.8 = 0.2


print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_train shape: {}".format(y_train.shape))
print("y_test shape: {}".format(y_test.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

X_train shape: (7402, 2)
X_test shape: (2468, 2)
y_train shape: (7402,)
y_test shape: (2468,)
X_val shape: (2468, 2)
y_val shape: (2468,)


In [45]:
y_train.value_counts()/y_train.shape[0]*100

5    80.559308
4     7.430424
1     6.754931
3     3.080249
2     2.175088
Name: rating, dtype: float64

In [46]:
y_test.value_counts()/y_test.shape[0]*100

5    80.551053
4     7.455429
1     6.766613
3     3.079417
2     2.147488
Name: rating, dtype: float64

In [47]:
y_val.value_counts()/y_val.shape[0]*100

5    80.551053
4     7.455429
1     6.766613
3     3.079417
2     2.147488
Name: rating, dtype: float64

### Step 3 - ML Classifications applied

We are now ready to experiment with different machine learning models, evaluate their accuracy and find the source of any potential issues.

We will benchmark the following 5 models:

- Logistic Regression
- (Multinomial) Naive Bayes
- Linear Support Vector Machine
- Random Forest
- K-Nearest Neighbor (KNN)

In [None]:
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    KNeighborsClassifier(n_neighbors = 5),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5 #nb of model
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
        
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])



sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
cv_df.groupby('model_name').accuracy.mean()

### Step 4 - Performance ML Classification models comparison

2 models seem to perform well : SVC & Logistic regression.
We are going to look deeper in their performance.

In [None]:
model = LinearSVC() 
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.2, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

X_train, X_val, y_train, y_val, indices_train, indices_val = train_test_split(features, labels, df.index, test_size=0.2, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

#from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=rating_df['rating'].values, yticklabels=rating_df['rating'].values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
#from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

In [None]:
model2 = LogisticRegression()
#X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df_test1.index, test_size=0.2, random_state=0)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
conf_mat2 = confusion_matrix(y_test, y_pred2)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=rating_df['rating_label'].values, yticklabels=rating_df['rating_label'].values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
print(metrics.classification_report(y_test, y_pred2))

**Conclusion** => best ML is SVC. So this will be applied on Twitter reviews

# Phase 3 => Application on tweets