# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing datasets

In [None]:
# Load the raw train/test tables and the sample submission.
# Paths are passed straight to pandas so that it opens *and closes* the files;
# the previous `open(...)` handles were never closed.
data_train_nlp = pd.read_csv('train.csv')
data_test_nlp = pd.read_csv('test.csv')
sample_sub_nlp = pd.read_csv('sample_submission.csv')

In [None]:
# Preview the first two rows of the raw training table.
data_train_nlp.head(n=2)

In [None]:
# Preview the first two rows of the raw test table.
data_test_nlp.head(n=2)

# Analyzing

In [None]:
# Print the training-set column names as a plain Python list.
train_column_names = list(data_train_nlp.columns)
print(train_column_names)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import ensemble 
from sklearn.metrics import classification_report, accuracy_score

# Nulls

In [None]:
# Null count per column of the training set, as a two-column table
# ('columns', '# of nulls') with a default integer index.
# Everything is derived from `data_train_nlp`, which is already loaded at this
# point; the previous version read `train_nlp.columns` before `train_nlp`
# was defined and therefore failed on a fresh kernel.
col2 = data_train_nlp.isnull().sum()
col1 = data_train_nlp.columns
nulls_train = col2.rename_axis('columns').reset_index(name='# of nulls')
nulls_train

In [None]:
# Null count per column of the test set, as a two-column table
# ('columns', '# of nulls') with a default integer index.
# Everything is derived from `data_test_nlp`, which is already loaded at this
# point; the previous version read `test_nlp.columns` before `test_nlp`
# was defined and therefore failed on a fresh kernel.
col2 = data_test_nlp.isnull().sum()
col1 = data_test_nlp.columns
nulls_test = col2.rename_axis('columns').reset_index(name='# of nulls')
nulls_test

In [None]:
# Side-by-side bar charts of the null counts per column (train left, test right).
# The figure has to be created here: with the `plt.subplots` call commented out,
# `ax` was undefined and the cell raised NameError on a fresh kernel.
fig_nulls, ax = plt.subplots(1, 2, figsize=(20, 10))

# Training set
ax[0].bar(nulls_train['columns'], nulls_train['# of nulls'], width=0.5, color='orange')
ax[0].set_title('Number of nulls in trainset')
ax[0].set_xlabel('column names')
ax[0].set_ylabel('number of nulls')

# Test set
ax[1].bar(nulls_test['columns'], nulls_test['# of nulls'], width=0.5, color='green')
ax[1].set_title('Number of nulls in testset')
ax[1].set_xlabel('column names')
ax[1].set_ylabel('number of nulls')

# No legend: the bars carry no labels, so `legend()` only produced a warning.
plt.show()

# Processing

In [None]:
# Work on independent copies so the feature columns added below do not leak
# back into the raw `data_*` frames (plain assignment only creates an alias),
# and so this section can be re-run from the untouched raw data.
train_nlp = data_train_nlp.copy()
test_nlp = data_test_nlp.copy()

In [None]:
# Create 'n'-prefixed copies of the two text columns in which missing values
# are replaced by an empty string; the original columns are left untouched.
for frame in (train_nlp, test_nlp):
    for text_column in ('Review', 'Review_Title'):
        frame['n' + text_column] = frame[text_column].fillna(value='')

In [None]:
def _lowercase_and_squeeze(text):
    """Lower-case every whitespace-separated token and re-join the tokens with single spaces."""
    return " ".join(token.lower() for token in text.split())


# Normalise the cleaned text columns of both frames in place.
for frame in (train_nlp, test_nlp):
    for text_column in ('nReview', 'nReview_Title'):
        frame[text_column] = frame[text_column].apply(_lowercase_and_squeeze)

In [None]:
# Missing values per column of the working training frame.
train_nlp.isnull().sum()

In [None]:
# Missing values per column of the working test frame.
test_nlp.isnull().sum()

# Age

In [None]:
# Mean reviewer age, test set first and then training set.
for split_name, frame in (('test', test_nlp), ('train', train_nlp)):
    print(f"Average age for {split_name} data:", frame['Age'].mean())

In [None]:
# Age distribution in the training set (matplotlib's default binning).
train_ages = train_nlp['Age']
plt.hist(train_ages)

In [None]:
# Age distribution in the test set (matplotlib's default binning).
test_ages = test_nlp['Age']
plt.hist(test_ages)

In [None]:
# Per-Division mean of every numeric column (Id and Pos_Feedback_Cnt excluded),
# sorted by mean Age; test set first, then training set.
# `numeric_only=True` is spelled out: the previous `.mean(['Age'])` passed a
# list into that positional slot instead of a boolean.
test_by_division = (
    test_nlp.drop(['Id', 'Pos_Feedback_Cnt'], axis=1)
    .groupby(['Division'], as_index=False)
    .mean(numeric_only=True)
    .sort_values(['Age'])
)
train_by_division = (
    train_nlp.drop(['Id', 'Pos_Feedback_Cnt'], axis=1)
    .groupby(['Division'], as_index=False)
    .mean(numeric_only=True)
    .sort_values(['Age'])
)
print(test_by_division)
print('=====================================================================================================================')
print(train_by_division)

In [None]:
# Per-Department mean of every numeric column (Id and Pos_Feedback_Cnt excluded),
# sorted by mean Age; test set first, then training set.
# `numeric_only=True` is spelled out: the previous `.mean(['Age'])` passed a
# list into that positional slot instead of a boolean.
test_by_department = (
    test_nlp.drop(['Id', 'Pos_Feedback_Cnt'], axis=1)
    .groupby(['Department'], as_index=False)
    .mean(numeric_only=True)
    .sort_values(['Age'])
)
train_by_department = (
    train_nlp.drop(['Id', 'Pos_Feedback_Cnt'], axis=1)
    .groupby(['Department'], as_index=False)
    .mean(numeric_only=True)
    .sort_values(['Age'])
)
print(test_by_department)
print('=====================================================================================================================')
print(train_by_department)

In [None]:
# Per-Product_Category mean of every numeric column (Id and Pos_Feedback_Cnt
# excluded), sorted by mean Age; test set first, then training set.
# `numeric_only=True` is spelled out: the previous `.mean(['Age'])` passed a
# list into that positional slot instead of a boolean.
test_by_category = (
    test_nlp.drop(['Id', 'Pos_Feedback_Cnt'], axis=1)
    .groupby(['Product_Category'], as_index=False)
    .mean(numeric_only=True)
    .sort_values(['Age'])
)
train_by_category = (
    train_nlp.drop(['Id', 'Pos_Feedback_Cnt'], axis=1)
    .groupby(['Product_Category'], as_index=False)
    .mean(numeric_only=True)
    .sort_values(['Age'])
)
print(test_by_category)
print('=====================================================================================================================')
print(train_by_category)
# all the product types in the test set exist in the train set as well

In [None]:
# Mean of every numeric column per rating value, ordered by rating.
# `numeric_only=True` keeps the text columns (reviews, titles, categories) out
# of the aggregation; averaging them raises a TypeError on pandas >= 2.0.
train_nlp.drop(['Id','Pos_Feedback_Cnt' ], axis=1).groupby(['Rating'], as_index = False).mean(numeric_only=True).sort_values(['Rating'])

# age does not give much insight in this case
# recommendation probability increases more after rating = 3
# rating recommendation relation seems reasonable

In [None]:
# Mean reviewer age for each rating value.
# The previous version passed the raw columns to `bar`, which draws one bar per
# review; the bars for a given rating overlap, so only the tallest (the maximum
# age) was visible. Aggregating first gives one meaningful bar per rating.
fig_age, ax = plt.subplots(figsize = (10,10))
age_by_rating = train_nlp.groupby('Rating')['Age'].mean()
ax.bar(age_by_rating.index, age_by_rating.values, width = 0.5, color = 'orange')
# titles
ax.set_title('Rating and Age')
ax.set_xlabel('Rating')
ax.set_ylabel('Average age')
# No legend: the bars carry no labels, so `legend()` only produced a warning.
plt.show()

In [None]:
# Share of reviews that recommend the product, for each rating value.
# The previous version plotted the 'Age' column here (copy-paste from the cell
# above) although the title and y-label describe the recommendation rate.
fig_rec, ax = plt.subplots(figsize = (10,10))
recommendation_by_rating = train_nlp.groupby('Rating')['Recommended'].mean()
ax.bar(recommendation_by_rating.index, recommendation_by_rating.values, width = 0.5, color = 'blue')
# titles
ax.set_title('Rating and Recommendation')
ax.set_xlabel('Rating')
ax.set_ylabel('Recommendation rate')
# The rate is a fraction, so fix the axis to its full range.
ax.set_ylim(0, 1)
# No legend: the bars carry no labels, so `legend()` only produced a warning.
plt.show()

In [None]:
# Largest positive-feedback count and the number of distinct counts observed.
feedback_counts = train_nlp['Pos_Feedback_Cnt']
print(feedback_counts.max())
print(feedback_counts.nunique())

# Rating and Recommendation

In [None]:
# Headline statistics for the recommendation flag and the rating.
recommended_flags = train_nlp['Recommended']
ratings = train_nlp['Rating']
print('Average recommendation:', recommended_flags.mean())
print('Average rating:', ratings.mean())
print('Minimum rating:', ratings.min())
print('Maximum rating:', ratings.max())

In [None]:
# Class balance of the recommendation flag (0 = not recommended, 1 = recommended).
recommended_flags = train_nlp['Recommended']
print('Number of not recommended:', (recommended_flags == 0).sum())
print('Number of recommended:', (recommended_flags == 1).sum())

In [None]:
# Number of training reviews for each rating value from 1 to 5.
ratings = train_nlp['Rating']
for score in range(1, 6):
    print(f'Number of rating {score}:', (ratings == score).sum())

In [None]:
# Number of rows in the 'Review' column (missing reviews included).
train_nlp['Review'].size

In [None]:
# NOTE(review): this alias rebinds the global name `str` to the `string` module,
# so the built-in `str` type is shadowed in every cell executed after this one
# (e.g. `str(x)` or `str.lower` would fail). None of the cells visible in this
# section use the alias -- the `.str` accessors below are pandas attributes, not
# this name. Consider `import string` instead; confirm no later cell relies on
# the alias before changing it.
import string as str

In [None]:
# Opinion keywords: for each one, how many training reviews mention it and the
# mean recommendation rate / rating of those reviews.
# Matching is a case-insensitive *substring* search on the raw 'Review' text
# (e.g. 'bad' also matches 'badly'); missing reviews never match.
# 'Number of words' therefore counts reviews containing the keyword, not
# individual word occurrences.
opinion_words = ['cute', 'good', 'bad', 'terrible', 'ridiculous', 'extraordinary', 'love',
                 'do not know', 'do not love', "don't know", "dont know", "don't love",
                 "magnificent", "happy", "terrific", "worse"]

opinion = {'Words': [],
           'Number of words': [],
           'Average recommendation rate': [],
           'Average Review Score': []}

for word in opinion_words:
    # Build the match mask once per keyword and reuse it for all three statistics.
    mentions_word = train_nlp['Review'].str.contains(word, case = False, na = False)
    matching_reviews = train_nlp[mentions_word]
    opinion['Words'].append(word)
    opinion['Number of words'].append(mentions_word.sum())
    opinion['Average recommendation rate'].append(matching_reviews['Recommended'].mean())
    opinion['Average Review Score'].append(matching_reviews['Rating'].mean())

opinion_tab = pd.DataFrame(opinion, columns = ['Words', 'Number of words', 'Average recommendation rate',  'Average Review Score'])

opinion_tab

In [None]:
# Season keywords: for each one, how many training reviews mention it and the
# mean recommendation rate / rating of those reviews.
# Matching is a case-insensitive *substring* search on the raw 'Review' text
# (e.g. 'fall' also matches 'falls'); missing reviews never match.
# 'Number of words' therefore counts reviews containing the keyword, not
# individual word occurrences.
season_words = ["fall", "autumn", "spring", "winter", "summer"]

season = {'Words': [],
          'Number of words': [],
          'Average recommendation rate': [],
          'Average Review Score': []}

for word in season_words:
    # Build the match mask once per keyword and reuse it for all three statistics.
    mentions_word = train_nlp['Review'].str.contains(word, case = False, na = False)
    matching_reviews = train_nlp[mentions_word]
    season['Words'].append(word)
    season['Number of words'].append(mentions_word.sum())
    season['Average recommendation rate'].append(matching_reviews['Recommended'].mean())
    season['Average Review Score'].append(matching_reviews['Rating'].mean())

season_tab = pd.DataFrame(season, columns = ['Words', 'Number of words', 'Average recommendation rate',  'Average Review Score'])

season_tab

In [None]:
# Colour keywords: for each one, how many training reviews mention it and the
# mean recommendation rate / rating of those reviews.
# Matching is a case-insensitive *substring* search on the raw 'Review' text,
# so short keywords over-count (e.g. 'red' also matches 'ordered'); missing
# reviews never match.
# 'Number of words' therefore counts reviews containing the keyword, not
# individual word occurrences.
color_words = ["green", "red", "yellow", "blue", "black", "white", "orange", "violet",
               "lilac", "purple", "marine", "dark", "light", "pink", "pastel", "brown"]

color = {'Words': [],
         'Number of words': [],
         'Average recommendation rate': [],
         'Average Review Score': []}

for word in color_words:
    # Build the match mask once per keyword and reuse it for all three statistics.
    mentions_word = train_nlp['Review'].str.contains(word, case = False, na = False)
    matching_reviews = train_nlp[mentions_word]
    color['Words'].append(word)
    color['Number of words'].append(mentions_word.sum())
    color['Average recommendation rate'].append(matching_reviews['Recommended'].mean())
    color['Average Review Score'].append(matching_reviews['Rating'].mean())

color_tab = pd.DataFrame(color, columns = ['Words','Number of words', 'Average recommendation rate',  'Average Review Score'])

color_tab

In [None]:
# Material keywords: for each one, how many training reviews mention it and the
# mean recommendation rate / rating of those reviews.
# Matching is a case-insensitive *substring* search on the raw 'Review' text
# (e.g. 'silk' also matches 'silky'); missing reviews never match.
# 'Number of words' therefore counts reviews containing the keyword, not
# individual word occurrences.
material_words = ["wool", "cotton", "silk", "leather"]

material = {'Words': [],
            'Number of words': [],
            'Average recommendation rate': [],
            'Average Review Score': []}

for word in material_words:
    # Build the match mask once per keyword and reuse it for all three statistics.
    mentions_word = train_nlp['Review'].str.contains(word, case = False, na = False)
    matching_reviews = train_nlp[mentions_word]
    material['Words'].append(word)
    material['Number of words'].append(mentions_word.sum())
    material['Average recommendation rate'].append(matching_reviews['Recommended'].mean())
    material['Average Review Score'].append(matching_reviews['Rating'].mean())

material_tab = pd.DataFrame(material, columns = ['Words', 'Number of words', 'Average recommendation rate',  'Average Review Score'])

material_tab

In [None]:
# Three-panel summary of the material keywords: mean rating, mean recommendation
# rate, and number of reviews mentioning each keyword.
# Interactive mode is switched off, so the figure is rendered by the explicit
# `plt.show()` at the end of the cell.
plt.ioff()
fig_mat, ax = plt.subplots(1,3, figsize = (20,10))
material_names = material_tab['Words']

# Panel 1: mean rating (ratings go up to 5)
ax[0].bar(material_names, material_tab['Average Review Score'], width = 0.5, color = 'orange')
ax[0].set_title('Rating')
ax[0].set_xlabel('Material')
ax[0].set_ylabel('Average rating Score')
ax[0].set_ylim(0,5)

# Panel 2: mean recommendation rate (a fraction between 0 and 1)
ax[1].bar(material_names, material_tab['Average recommendation rate'], width = 0.5, color = 'blue')
ax[1].set_title('Recommendation')
ax[1].set_xlabel('Material')
ax[1].set_ylabel('Average recommedation rate')
ax[1].set_ylim(0,1)

# Panel 3: number of reviews mentioning the keyword (y-axis left on autoscale;
# the stray second `ax[0].set_ylim(0,5)` that used to sit here was a no-op copy)
ax[2].bar(material_names, material_tab['Number of words'], width = 0.5, color = 'green')
ax[2].set_title('Words')
ax[2].set_xlabel('Material')
ax[2].set_ylabel('Number of words')

# No legends: the bars carry no labels, so `legend()` only produced warnings.
# Ending on plt.show() also keeps the cell from echoing a return value above the plot.
plt.show()

In [None]:
# Three-panel horizontal bar summary of the colour keywords: mean rating, mean
# recommendation rate, and number of reviews mentioning each keyword.
fig_col, ax = plt.subplots(1,3, figsize = (20,10))
color_names = color_tab['Words']

# (column to plot, bar colour, panel title, x-axis label) for each panel, left to right
color_panels = [
    ('Average Review Score', 'orange', 'Rating', 'Average rating Score'),
    ('Average recommendation rate', 'blue', 'Recommendation', 'Average recommedation rate'),
    ('Number of words', 'green', 'Words', 'Number of words'),
]
for panel, (column, bar_color, title, x_label) in zip(ax, color_panels):
    panel.barh(color_names, color_tab[column], height = 0.5, color = bar_color)
    panel.set_title(title)
    panel.set_ylabel('Colors')
    panel.set_xlabel(x_label)

# No legends: the bars carry no labels, so `legend()` only produced warnings.
# Ending on plt.show() keeps the cell from echoing a return value above the plot.
plt.show()

In [None]:
# Three-panel horizontal bar summary of the opinion keywords: mean rating, mean
# recommendation rate, and number of reviews mentioning each keyword.
# The category axis is labelled 'Opinion words'; it previously read 'Colors',
# left over from the colour plot this cell was copied from.
fig_opn, ax = plt.subplots(1,3, figsize = (20,10))
opinion_names = opinion_tab['Words']

# (column to plot, bar colour, panel title, x-axis label) for each panel, left to right
opinion_panels = [
    ('Average Review Score', 'orange', 'Rating', 'Average rating Score'),
    ('Average recommendation rate', 'blue', 'Recommendation', 'Average recommedation rate'),
    ('Number of words', 'green', 'Words', 'Number of words'),
]
for panel, (column, bar_color, title, x_label) in zip(ax, opinion_panels):
    panel.barh(opinion_names, opinion_tab[column], height = 0.5, color = bar_color)
    panel.set_title(title)
    panel.set_ylabel('Opinion words')
    panel.set_xlabel(x_label)

# No legends: the bars carry no labels, so `legend()` only produced warnings.
# Ending on plt.show() keeps the cell from echoing a return value above the plot.
plt.show()

In [None]:
# Three-panel summary of the season keywords: mean rating, mean recommendation
# rate, and number of reviews mentioning each keyword.
# The category axis is labelled 'Season'; it previously read 'Material',
# left over from the material plot this cell was copied from.
fig_seas, ax = plt.subplots(1,3, figsize = (20,10))
season_names = season_tab['Words']

# Panel 1: mean rating (ratings go up to 5)
ax[0].bar(season_names, season_tab['Average Review Score'], width = 0.5, color = 'orange')
ax[0].set_title('Rating')
ax[0].set_xlabel('Season')
ax[0].set_ylabel('Average rating Score')
ax[0].set_ylim(0,5)

# Panel 2: mean recommendation rate (a fraction between 0 and 1)
ax[1].bar(season_names, season_tab['Average recommendation rate'], width = 0.5, color = 'blue')
ax[1].set_title('Recommendation')
ax[1].set_xlabel('Season')
ax[1].set_ylabel('Average recommedation rate')
ax[1].set_ylim(0,1)

# Panel 3: number of reviews mentioning the keyword (y-axis left on autoscale;
# the stray second `ax[0].set_ylim(0,5)` that used to sit here was a no-op copy)
ax[2].bar(season_names, season_tab['Number of words'], width = 0.5, color = 'green')
ax[2].set_title('Words')
ax[2].set_xlabel('Season')
ax[2].set_ylabel('Number of words')

# No legends: the bars carry no labels, so `legend()` only produced warnings.
# Ending on plt.show() keeps the cell from echoing a return value above the plot.
plt.show()

# Correlation

In [None]:
# Pairwise correlation of the numeric training columns.
# Text columns are filtered out first: on pandas >= 2.0 `DataFrame.corr()` no
# longer drops them silently and raises on string data.
train_numeric = train_nlp.select_dtypes(include=['number', 'bool'])
sns.heatmap(train_numeric.corr(),linewidths=0.5,vmax=1, linecolor='black', annot=True)

In [None]:
# Pairwise correlation of the numeric test columns.
# Text columns are filtered out first: on pandas >= 2.0 `DataFrame.corr()` no
# longer drops them silently and raises on string data.
test_numeric = test_nlp.select_dtypes(include=['number', 'bool'])
sns.heatmap(test_numeric.corr(),linewidths=0.5,vmax=1, linecolor='black', annot=True)