In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import *
import wordcloud
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Install a global "ignore" filter: no warning of any category is shown from here on
# (this includes deprecation notices emitted by the plotting libraries).
warnings.filterwarnings("ignore")

In [1]:
# Load the training CSV; index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv("../input/iba-ml1-final-project/train.csv", index_col=0)
# Preview the first rows.
df.head()

In [1]:
# Keep the column labels; the missing-value report in the EDA section iterates over them.
columns_d=df.columns
columns_d

# Exploratory Data Analysis (EDA)

In [1]:
# Print, for every column, the share of missing values as a whole-number percentage.
for column_name in columns_d:
    null_fraction = np.mean(df[column_name].isna())
    percent_missing = round(null_fraction * 100)
    print('{} - {}%'.format(column_name, percent_missing))

In [1]:
# Pairwise correlation of the numeric columns. They are selected explicitly because
# pandas >= 2.0 no longer drops text columns silently in DataFrame.corr() and raises instead.
df.select_dtypes(include=['number', 'bool']).corr()

In [1]:
# Correlation of every numeric column with the LAST numeric column, strongest first.
# Numeric columns are selected explicitly so the cell also runs on pandas >= 2.0,
# where DataFrame.corr() raises on text columns.
# NOTE(review): the target is picked by position (last column of the matrix), so this
# is the correlation with 'Recommended' only if that is the last numeric column — TODO confirm.
corr_with_TARGET_1 = (
    df.select_dtypes(include=['number', 'bool'])
    .corr()
    .iloc[:-1, -1]
    .sort_values(ascending=False)
)
corr_with_TARGET_1

In [1]:
# Correlations taken from the second-to-last row of the numeric correlation matrix,
# restricted to matrix columns 0, 1 and 3, strongest first.
# Numeric columns are selected explicitly so the cell also runs on pandas >= 2.0,
# where DataFrame.corr() raises on text columns.
# NOTE(review): row and columns are picked by position; presumably row -2 is
# 'Recommended' and column 2 is skipped because it is the target itself — TODO confirm.
corr_with_TARGET_2 = (
    df.select_dtypes(include=['number', 'bool'])
    .corr()
    .iloc[-2, [0,1,3]]
    .sort_values(ascending=False)
)
corr_with_TARGET_2

In [1]:
# Horizontal bar chart of the values stored in corr_with_TARGET_1.
sns.set(font_scale=1)
corr_fig_1, corr_ax_1 = plt.subplots(figsize=(10, 10))
sns.barplot(
    x=corr_with_TARGET_1.values,
    y=corr_with_TARGET_1.index,
    ax=corr_ax_1,
)
corr_ax_1.set_title('Correlation with target variable => Recommended')

In [1]:
# Horizontal bar chart of the values stored in corr_with_TARGET_2.
sns.set(font_scale=1)
corr_fig_2, corr_ax_2 = plt.subplots(figsize=(10, 10))
sns.barplot(
    x=corr_with_TARGET_2.values,
    y=corr_with_TARGET_2.index,
    ax=corr_ax_2,
)
corr_ax_2.set_title('Correlation with target variable => Recommended')

In [1]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [1]:
# Descriptive statistics of the frame's columns.
df.describe()

In [1]:
# Show the column labels for reference.
df.columns

In [1]:
# Histogram of the 'Age' column, drawn on the current axes with grid lines hidden.
age_ax = plt.gca()
age_ax.grid(False)
age_ax.hist(df['Age'], color='green', label="Age")
age_ax.legend()
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Count")
age_ax.set_title("Age Distribution in Data")

In [1]:
# Number of rows per 'Division' value, grid lines hidden.
division_ax = plt.gca()
division_ax.grid(False)
sns.countplot(data=df, x='Division', ax=division_ax)

In [1]:
# Number of rows per 'Department' value, grid lines hidden.
department_ax = plt.gca()
department_ax.grid(False)
sns.countplot(data=df, x='Department', ax=department_ax)

In [1]:
# Distribution of 'Age' within each 'Rating' value.
age_rating_fig, age_rating_ax = plt.subplots(figsize=(10,10))
sns.boxplot(data=df, x='Rating', y='Age', ax=age_rating_ax)

In [1]:
# Pie chart of how often each 'Rating' value occurs, labelled with percentages.
plt.figure(figsize=(10,10))
rating_counts = df['Rating'].value_counts()
plt.pie(rating_counts, labels=rating_counts.index, autopct='%.2f')
plt.title("Rating percentage", fontsize=25, color='purple')
plt.legend()
plt.show()

In [1]:
# List the distinct values of each categorical column.
for categorical_column in ('Division', 'Department', 'Product_Category'):
    print(df[categorical_column].unique())

In [1]:
# Heatmap of the missing-value mask: one cell per (row, column), True where the value is missing.
plt.figure(figsize=(10, 10))
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap="Blues")

In [1]:
# Replace missing values with a single space in the non-numeric columns only.
# Filling the whole frame with ' ' would also put a string into any numeric column
# that has gaps, converting that column to object dtype.
text_columns = df.select_dtypes(exclude=['number', 'bool']).columns
df = df.fillna({column: ' ' for column in text_columns})

In [1]:
# Split the rows by the 'Recommended' flag: rd holds flag == 1, nrd holds flag == 0.
is_recommended = df['Recommended'] == 1
is_not_recommended = df['Recommended'] == 0
rd = df[is_recommended]
nrd = df[is_not_recommended]
rd.head()

In [1]:
# Bar chart of how many rows carry each 'Recommended' value, grid lines hidden.
target_ax = plt.gca()
target_ax.grid(False)
df["Recommended"].value_counts().plot(kind='bar', ax=target_ax)
target_ax.set_xlabel("Recommended")
target_ax.set_ylabel("Counts")
target_ax.set_title("Proportion Target Class")

In [1]:
sns.set(style="darkgrid")

# 'Age' histogram in 10 bins with a KDE overlay, coloured by the 'Recommended' flag.
age_hist_options = dict(x='Age', hue='Recommended', bins=10, kde=True, color='navy')
sns.histplot(data=df, **age_hist_options)
plt.show()

In [1]:
# Heatmap of the correlation matrix of the numeric columns. They are selected explicitly
# because pandas >= 2.0 raises in DataFrame.corr() when text columns are present.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())

In [1]:
# Overlaid 'Rating' histograms for recommended (rd) and not-recommended (nrd) rows,
# drawn in the top-left cell of a 2x2 grid.
plt.figure(figsize=(18, 18))
plt.subplot2grid((2, 2), (0, 0))
plt.xticks(rotation=45)
rating_layers = (
    (rd, "red", "Recommended"),
    (nrd, "blue", "Not Recommended"),
)
for subset, colour, legend_label in rating_layers:
    plt.hist(subset['Rating'], color=colour, alpha=0.5, label=legend_label)
plt.title("Recommended Items in Rating")
plt.legend()

In [1]:
plt.style.use('ggplot')

fig = plt.figure(figsize=(18, 18))

# One panel per categorical column:
# (grid position, extra subplot2grid options, column, (rd colour, nrd colour), title).
category_panels = (
    ((0, 0), {}, 'Division', ("red", "blue"), "Recommended Items in each Division"),
    ((0, 1), {}, 'Department', ("green", "yellow"), "Recommended Items in each Department"),
    ((1, 0), {'colspan': 2}, 'Product_Category', ("blue", "cyan"), "Recommended Items in each Category"),
)

for grid_position, grid_options, column, (rd_colour, nrd_colour), panel_title in category_panels:
    plt.subplot2grid((2, 2), grid_position, **grid_options)
    plt.xticks(rotation=45)
    # Overlay the recommended and not-recommended rows on the same axes.
    plt.hist(rd[column], color=rd_colour, alpha=0.5, label="Recommended")
    plt.hist(nrd[column], color=nrd_colour, alpha=0.5, label="Not Recommended")
    plt.title(panel_title)
    plt.legend()

In [1]:
# Add a 'Review Length' column: the character count of each review after casting it to str.
review_as_text = df['Review'].astype(str)
df['Review Length'] = review_as_text.map(len)
df.head()

In [1]:
# Distribution of review lengths.
fig = plt.figure(figsize=(15, 10))
# histplot replaces the deprecated distplot; kde=True with stat="density" keeps the
# density-normalised histogram plus KDE curve that distplot drew by default.
ax = sns.histplot(df['Review Length'], color="red", kde=True, stat="density")
ax.set_title("Length of Reviews")
plt.show()

In [1]:
# Distribution of 'Review Length' for each distinct 'Age' value.
length_age_fig, length_age_ax = plt.subplots(figsize=(20,10))
sns.boxplot(data=df, x='Age', y='Review Length', ax=length_age_ax)

In [1]:
plt.style.use('ggplot')

fig = plt.figure(figsize=(18, 18))

# One box-plot panel per categorical column: (grid position, column span, column, title).
length_panels = (
    ((0, 0), 1, 'Division', "Review Length in each Division"),
    ((0, 1), 1, 'Department', "Review Length in each Department"),
    ((1, 0), 2, 'Product_Category', "Review Length in each Category"),
)

for grid_position, column_span, column, panel_title in length_panels:
    plt.subplot2grid((2, 2), grid_position, colspan=column_span)
    plt.xticks(rotation=45)
    sns.boxplot(x=column, y='Review Length', data=df)
    plt.title(panel_title)

In [1]:
# Distribution of 'Pos_Feedback_Cnt' within each 'Rating' value.
feedback_fig, feedback_ax = plt.subplots(figsize=(10,10))
sns.boxplot(data=df, x='Rating', y='Pos_Feedback_Cnt', ax=feedback_ax)

In [1]:
# Word cloud built from all review titles joined into one string.
title_text = df['Review_Title'].str.cat(sep=' ')
# Stored as `title_cloud` rather than `wordcloud`, so the imported `wordcloud`
# module is not overwritten by the WordCloud instance.
title_cloud = WordCloud(background_color="yellow", width=800, height=480, margin=0).generate(title_text)

# Display the generated image without axes or margins.
plt.imshow(title_cloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

In [1]:
# Word cloud built from all review texts joined into one string.
review_text = df['Review'].str.cat(sep=' ')
# Stored as `review_cloud` rather than `wordcloud`, so the imported `wordcloud`
# module is not overwritten by the WordCloud instance.
review_cloud = WordCloud(background_color="white", width=800, height=480, margin=0, colormap='gist_heat').generate(review_text)

# Display the generated image without axes or margins.
plt.imshow(review_cloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

In [1]:
# Covariance matrix of the numeric columns. They are selected explicitly because
# pandas >= 2.0 raises in DataFrame.cov() when text columns are present.
conv = df.select_dtypes(include=['number', 'bool']).cov()
conv

In [1]:
# Heatmap of the covariance matrix held in `conv`, with each cell's value written on it.
sns.heatmap(conv,annot=True)

In [1]:
import re

def clean_data(text):
    """Reduce ``text`` to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased, and the whitespace-separated words are joined
    back together with single spaces.
    """
    return " ".join(re.sub("[^a-zA-Z]", " ", text).lower().split())

from wordcloud import WordCloud, STOPWORDS
# wordcloud's built-in STOPWORDS plus garment names, which are excluded from the clouds.
# 'pants' and 'jean' must be separate entries: without the comma between them Python
# concatenates the adjacent literals into 'pantsjean' and neither word is filtered.
stopwords= set(STOPWORDS)|{'skirt', 'blouse','dress','sweater', 'shirt','bottom', 'pant', 'pants', 'jean', 'jeans','jacket', 'top', 'dresse'}

def create_cloud(rating):
    """Draw a word cloud for a collection of text strings.

    ``rating`` is an iterable of strings. Its items are joined with spaces and
    rendered as a white-background cloud of at most 100 words, excluding the
    module-level ``stopwords``, in a 15 x 7.5 figure with the axes hidden.
    """
    combined_text = ' '.join(list(rating))
    cloud_image = WordCloud(
        background_color='white',
        width=1600,
        height=800,
        max_words=100,
        stopwords=stopwords,
    ).generate(combined_text)

    plt.figure(figsize=(15,7.5))
    plt.axis('off')
    plt.imshow(cloud_image)
    plt.show()

In [1]:
# Word cloud of the cleaned reviews that have Rating == 5.
five_star_rows = df['Rating'] == 5
rating5 = df.loc[five_star_rows, 'Review'].apply(clean_data)
create_cloud(rating5)

In [1]:
# Word cloud of the cleaned reviews that have Rating == 4.
four_star_rows = df['Rating'] == 4
rating4 = df.loc[four_star_rows, 'Review'].apply(clean_data)
create_cloud(rating4)

In [1]:
# Word cloud of the cleaned reviews that have Rating == 3.
three_star_rows = df['Rating'] == 3
rating3 = df.loc[three_star_rows, 'Review'].apply(clean_data)
create_cloud(rating3)

In [1]:
# Word cloud of the cleaned reviews that have Rating == 2.
two_star_rows = df['Rating'] == 2
rating2 = df.loc[two_star_rows, 'Review'].apply(clean_data)
create_cloud(rating2)

In [1]:
# Word cloud of the cleaned reviews that have Rating == 1.
one_star_rows = df['Rating'] == 1
rating1 = df.loc[one_star_rows, 'Review'].apply(clean_data)
create_cloud(rating1)

In [1]:
# Show the column labels again ('Review Length' was added to df in an earlier cell).
df.columns

In [1]:
# Independent copies of the frame for the n-gram analysis. Plain assignment would
# only create two more names for the same DataFrame object, not copies.
copy1=df.copy()
copy2=df.copy()

In [1]:
def top_n_ngram(corpus,n = None,ngram=1):
    """Return the most frequent n-grams of ``corpus`` as ``(term, count)`` pairs.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count terms in.
    n : int or None, optional
        How many of the top entries to return; ``None`` returns every term.
    ngram : int, optional
        Size of the n-grams to count (1 for single words, 2 for word pairs, ...).

    Returns
    -------
    list of tuple
        ``(term, total_count)`` pairs ordered by descending count. Terms are
        extracted by ``CountVectorizer`` with its English stop-word list.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(ngram, ngram))

    # Learn the vocabulary and build the document-term matrix in a single pass
    # instead of running fit() and transform() separately over the same corpus.
    term_matrix = vectorizer.fit_transform(corpus)

    # Summing over the documents gives one total per vocabulary term (a 1 x vocab matrix).
    term_totals = term_matrix.sum(axis=0)

    words_freq = [
        (term, term_totals[0, column])
        for term, column in vectorizer.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)

    return words_freq[:n]

In [1]:
# The 20 most frequent single words across all reviews.
common_words = top_n_ngram(copy1['Review'], 20,1)

# Keep the result under its own name instead of overwriting `copy1`: the cell reads
# copy1['Review'], so rebinding `copy1` here would make a re-run fail with a KeyError.
top_unigrams = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])

plt.figure(figsize =(10,5))

top_unigrams.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 unigrams in review after removing stop words')

In [1]:
# Show the column labels of df once more before the bigram analysis.
df.columns

In [1]:
# The 20 most frequent two-word sequences across all reviews.
common_words_2 = top_n_ngram(copy2['Review'], 20, 2)

# Keep the result under its own name instead of overwriting `copy2`: the cell reads
# copy2['Review'], so rebinding `copy2` here would make a re-run fail with a KeyError.
top_bigrams = pd.DataFrame(common_words_2, columns = ['ReviewText' , 'count'])

plt.figure(figsize =(10,5))

top_bigrams.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 bigrams in review after removing stop words')

# *** The End ***