## Importing

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# <span>View dataset</span>

In [69]:
# Read both CSV splits in one pass: the first path becomes `train`,
# the second becomes `test`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./dataset/train_data.csv", "./dataset/test_data.csv")
)

In [70]:
# Non-null entry count for every column of the train dataset
train.count()

name                 4000
brand                4000
categories           4000
primaryCategories    4000
reviews.date         4000
reviews.text         4000
reviews.title        3990
sentiment            4000
dtype: int64

<p>Every column in the train dataset has 4000 non-null entries except <strong style="color: red;">reviews.title</strong>, which has 3990. That column therefore has 10 missing (null) entries, which are treated in section 1.1b (i).</p>


In [71]:
# Non-null entry count for every column of the test dataset
test.count()

name                 1000
brand                1000
categories           1000
primaryCategories    1000
reviews.date         1000
reviews.text         1000
reviews.title         997
dtype: int64

Every column in the test dataset has 1000 non-null entries except <strong style="color: red;">reviews.title</strong>, which has 997. That column therefore has 3 missing (null) entries, which are treated in section 1.1b (i).

In [72]:
# Summary statistics (count, unique, top, freq) for each column of the raw train dataset
train.describe()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
count,4000,4000,4000,4000,4000,4000,3990,4000
unique,23,1,23,4,638,3598,2606,3
top,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-23T00:00:00.000Z,I bought this kindle for my 11yr old granddaug...,Great tablet,Positive
freq,676,4000,628,2600,99,4,100,3749


In [73]:
# Summary statistics (count, unique, top, freq) for each column of the raw test dataset
test.describe()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title
count,1000,1000,1000,1000,1000,1000,997
unique,23,1,23,4,366,979,796
top,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-23T00:00:00.000Z,I bought the white version and have it in the ...,Great tablet
freq,169,1000,169,676,26,2,22


In [74]:
# Preview the first rows of the train dataset (includes the `sentiment` label column)
train.head()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
0,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2016-12-26T00:00:00.000Z,Purchased on Black FridayPros - Great Price (e...,Powerful tablet,Positive
1,Amazon - Echo Plus w/ Built-In Hub - Silver,Amazon,"Amazon Echo,Smart Home,Networking,Home & Tools...","Electronics,Hardware",2018-01-17T00:00:00.000Z,I purchased two Amazon in Echo Plus and two do...,Amazon Echo Plus AWESOME,Positive
2,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Amazon Echo,Virtual Assistant Speakers,Electro...","Electronics,Hardware",2017-12-20T00:00:00.000Z,Just an average Alexa option. Does show a few ...,Average,Neutral
3,"Fire HD 10 Tablet, 10.1 HD Display, Wi-Fi, 16 ...",Amazon,"eBook Readers,Fire Tablets,Electronics Feature...","Office Supplies,Electronics",2017-08-04T00:00:00.000Z,"very good product. Exactly what I wanted, and ...",Greattttttt,Positive
4,"Brand New Amazon Kindle Fire 16gb 7"" Ips Displ...",Amazon,"Computers/Tablets & Networking,Tablets & eBook...",Electronics,2017-01-23T00:00:00.000Z,This is the 3rd one I've purchased. I've bough...,Very durable!,Positive


In [75]:
# Preview the first rows of the test dataset (same columns as train, without `sentiment`)
test.head()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title
0,"Fire Tablet, 7 Display, Wi-Fi, 16 GB - Include...",Amazon,"Fire Tablets,Computers/Tablets & Networking,Ta...",Electronics,2016-05-23T00:00:00.000Z,Amazon kindle fire has a lot of free app and c...,very handy device
1,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Computers,Amazon Echo,Virtual Assistant Speake...","Electronics,Hardware",2018-01-02T00:00:00.000Z,The Echo Show is a great addition to the Amazo...,Another winner from Amazon
2,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-02T00:00:00.000Z,Great value from Best Buy. Bought at Christmas...,simple to use and reliable so far
3,"Brand New Amazon Kindle Fire 16gb 7"" Ips Displ...",Amazon,"Computers/Tablets & Networking,Tablets & eBook...",Electronics,2017-03-25T00:00:00.000Z,"I use mine for email, Facebook ,games and to g...",Love it!!!
4,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Computers,Amazon Echo,Virtual Assistant Speake...","Electronics,Hardware",2017-11-15T00:00:00.000Z,This is a fantastic item & the person I bought...,Fantastic!


# <span>1.1 Cleaning and preparing dataset</span>

### <strong style="color: blue;">1.1a Clean the data of errors and inconsistencies</strong>

#### <span >1.1a (i) Dealing with duplicates</span>

In [76]:
# Count the rows of train that are exact copies of an earlier row
n_train_duplicates = train.duplicated().sum()
print(f"Number of duplicates in train dataset: {n_train_duplicates}")

Number of duplicates in train dataset: 58


In [77]:
# Count the rows of test that are exact copies of an earlier row
n_test_duplicates = test.duplicated().sum()
print(f"Number of duplicates in test dataset: {n_test_duplicates}")

Number of duplicates in test dataset: 3


In [78]:
# Remove exact duplicate rows from both splits. ignore_index=True renumbers
# the surviving rows 0..n-1, so the index has no gaps afterwards.
train = train.drop_duplicates(ignore_index=True)
test = test.drop_duplicates(ignore_index=True)

In [79]:
# Summary of the train dataset after removing duplicates (58 rows dropped: 4000 -> 3942)
train.describe()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
count,3942,3942,3942,3942,3942,3942,3932,3942
unique,23,1,23,4,638,3598,2606,3
top,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-23T00:00:00.000Z,I bought this kindle for my 11yr old granddaug...,Great tablet,Positive
freq,676,3942,628,2562,98,4,99,3694


In [80]:
# Summary of the test dataset after removing duplicates (3 rows dropped: 1000 -> 997)
test.describe()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title
count,997,997,997,997,997,997,994
unique,23,1,23,4,366,979,796
top,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-23T00:00:00.000Z,Got this tablet for my 7yr old daughter and it...,Great tablet
freq,169,997,169,673,26,2,22


### <strong style="color: blue;">1.1b Deal with missing values and outliers</strong>

#### <span >1.1b (i) Dealing with missing or null values</span>

In [81]:
# Number of missing (null) values per column in train, before any handling
train.isnull().sum()

name                  0
brand                 0
categories            0
primaryCategories     0
reviews.date          0
reviews.text          0
reviews.title        10
sentiment             0
dtype: int64

In [82]:
# Number of missing (null) values per column in test, before any handling
test.isnull().sum()

name                 0
brand                0
categories           0
primaryCategories    0
reviews.date         0
reviews.text         0
reviews.title        3
dtype: int64

In [83]:
# Drop every row that contains a missing value (at this point only
# `reviews.title` has nulls). Rebinding the names instead of using
# inplace=True keeps the cell free of in-place mutation, and resetting the
# index keeps it contiguous, as the duplicate-removal step does.
train = train.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)

In [84]:
# Confirm that no missing values remain in train after dropping incomplete rows
train.isnull().sum()

name                 0
brand                0
categories           0
primaryCategories    0
reviews.date         0
reviews.text         0
reviews.title        0
sentiment            0
dtype: int64

In [85]:
# Confirm that no missing values remain in test after dropping incomplete rows
test.isnull().sum()

name                 0
brand                0
categories           0
primaryCategories    0
reviews.date         0
reviews.text         0
reviews.title        0
dtype: int64

In [86]:
# Re-summarise train after dropping the rows with missing values
train.describe()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
count,3932,3932,3932,3932,3932,3932,3932,3932
unique,23,1,23,4,630,3590,2606,3
top,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-23T00:00:00.000Z,I bought this kindle for my 11yr old granddaug...,Great tablet,Positive
freq,667,3932,628,2561,98,4,99,3684


In [87]:
# Re-summarise test after dropping the rows with missing values
test.describe()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title
count,994,994,994,994,994,994,994
unique,23,1,23,4,363,976,796
top,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2017-01-23T00:00:00.000Z,A great new Amazon device to add to my other A...,Great tablet
freq,169,994,169,673,26,2,22


#### <span >1.1b (ii) Dealing with outliers</span>

In [88]:
# Inspect the column dtypes of train to see whether any numeric column exists for outlier analysis
train.dtypes

name                 object
brand                object
categories           object
primaryCategories    object
reviews.date         object
reviews.text         object
reviews.title        object
sentiment            object
dtype: object

In [89]:
# Inspect the column dtypes of test to see whether any numeric column exists for outlier analysis
test.dtypes

name                 object
brand                object
categories           object
primaryCategories    object
reviews.date         object
reviews.text         object
reviews.title        object
dtype: object

<p>The IQR method looks at the middle 50% of the data: any value below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR is considered an outlier. However, since every column is of object type rather than numerical, the method cannot be applied, so no outliers are treated here.</p>

### <strong style="color: blue;">1.1c Transform the data into a format suitable for analysis and processing</strong>

#### <span >1.1c (i) Imputations</span>

In [92]:
# NOTE(review): this cell was executed out of order (In [92]), i.e. after the
# cell in section 1.1c (ii) (In [90]) that converts `reviews.date` to datetime
# and adds `review_year`. The output below reflects that later state; on a
# fresh top-to-bottom run those changes will not have been applied yet here.
train.dtypes

name                              object
brand                             object
categories                        object
primaryCategories                 object
reviews.date         datetime64[ns, UTC]
reviews.text                      object
reviews.title                     object
sentiment                         object
review_year                        int32
dtype: object

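<p>No imputation is needed: the only missing values (in reviews.title) were removed by dropping those rows in section 1.1b (i), so no column contains null entries. </p>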

#### <span >1.1c (ii) Adding/removing columns</span>

In [90]:
# Parse the raw 'reviews.date' strings into pandas datetimes and store them back
review_dates = pd.to_datetime(train['reviews.date'])
train['reviews.date'] = review_dates

# Derive the calendar year of each review as its own 'review_year' column
train['review_year'] = review_dates.dt.year


In [91]:
# Preview train to check the parsed dates and the new `review_year` column
train.head()

Unnamed: 0,name,brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment,review_year
0,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2016-12-26 00:00:00+00:00,Purchased on Black FridayPros - Great Price (e...,Powerful tablet,Positive,2016
1,Amazon - Echo Plus w/ Built-In Hub - Silver,Amazon,"Amazon Echo,Smart Home,Networking,Home & Tools...","Electronics,Hardware",2018-01-17 00:00:00+00:00,I purchased two Amazon in Echo Plus and two do...,Amazon Echo Plus AWESOME,Positive,2018
2,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Amazon Echo,Virtual Assistant Speakers,Electro...","Electronics,Hardware",2017-12-20 00:00:00+00:00,Just an average Alexa option. Does show a few ...,Average,Neutral,2017
3,"Fire HD 10 Tablet, 10.1 HD Display, Wi-Fi, 16 ...",Amazon,"eBook Readers,Fire Tablets,Electronics Feature...","Office Supplies,Electronics",2017-08-04 00:00:00+00:00,"very good product. Exactly what I wanted, and ...",Greattttttt,Positive,2017
4,"Brand New Amazon Kindle Fire 16gb 7"" Ips Displ...",Amazon,"Computers/Tablets & Networking,Tablets & eBook...",Electronics,2017-01-23 00:00:00+00:00,This is the 3rd one I've purchased. I've bough...,Very durable!,Positive,2017


#### <span >1.1c (iii) Renaming columns</span>

In [None]:

# a. Explore the data to understand its distribution, relationships between variables, and note any outliers.

# Distribution of the target variable
sentiment_distribution = train['sentiment'].value_counts()
print("Sentiment Distribution:\n", sentiment_distribution)

# Distribution of brands and primary categories. Both are printed so that
# this exploration step actually reports what it computes.
brand_distribution = train['brand'].value_counts()
category_distribution = train['primaryCategories'].value_counts()
print("Brand Distribution:\n", brand_distribution)
print("Primary Category Distribution:\n", category_distribution)

# Review length in characters, via the vectorised .str accessor
train['review_length'] = train['reviews.text'].str.len()

# Flag unusually long reviews: more than OUTLIER_STD_THRESHOLD standard
# deviations above the mean length. Only the upper tail is checked.
OUTLIER_STD_THRESHOLD = 3
length_cutoff = train['review_length'].mean() + OUTLIER_STD_THRESHOLD * train['review_length'].std()
outliers = train[train['review_length'] > length_cutoff]
print("Outliers based on review length:\n", outliers[['name', 'review_length']])

# Relationship between sentiment and review length, shown as one box per
# sentiment class. `sns` and `plt` come from the notebook's import cell,
# so they are not re-imported here.
plt.figure(figsize=(10, 6))
sns.boxplot(x='sentiment', y='review_length', data=train)
plt.title('Review Length by Sentiment')
plt.show()


In [None]:

# b. Descriptive Statistics
# Requires the 'review_length' column created in the exploration cell.

# count / mean / std / min / quartiles / max of the review length
review_length_stats = train['review_length'].describe()
print("Review Length Statistics:\n", review_length_stats)

# Central tendency of the categorical target: its mode plus the per-class counts
sentiment_counts = train['sentiment'].value_counts()
sentiment_mode = train['sentiment'].mode()[0]
print(f"Most frequent sentiment: {sentiment_mode}")
print("Sentiment counts:\n", sentiment_counts)

# Spread of the review length, kept as a (variance, standard deviation) pair
review_length_variability = (train['review_length'].var(), train['review_length'].std())
length_variance, length_std = review_length_variability
print(f"Review Length Variance: {length_variance}, Standard Deviation: {length_std}")

# Bar chart of how many reviews fall into each sentiment class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='sentiment', data=train, ax=ax)
ax.set_title('Sentiment Distribution')
plt.show()


In [None]:

# c. Inferential Statistics

# Chi-square test of independence between brand and sentiment
from scipy.stats import chi2_contingency

# Significance level used to interpret the p-value
ALPHA = 0.05

# Contingency table: one row per brand, one column per sentiment label
contingency_table = pd.crosstab(train['brand'], train['sentiment'])

# Perform the chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Display the results of the chi-square test
print(f"Chi-square Statistic: {chi2}, p-value: {p}, Degrees of Freedom: {dof}")

# Interpretation
# A table with a single row or a single column has zero degrees of freedom.
# The describe() output earlier shows `brand` has one unique value, which is
# exactly this case. scipy then reports chi2 = 0 and p = 1, which must not be
# read as evidence of "no relationship": the test cannot be applied at all.
if dof == 0:
    print("The chi-square test is not applicable: brand or sentiment has only one distinct value, so no relationship can be assessed.")
elif p < ALPHA:
    print("There is a statistically significant relationship between brand and sentiment.")
else:
    print("There is no statistically significant relationship between brand and sentiment.")


In [None]:

# a. (*) Inconsistencies, Missing Values, Limitations, and Biases

# Re-check for missing values in the train dataset
missing_values = train.isnull().sum()
print("Missing values per column:\n", missing_values)

# Missing values:
# 'reviews.title' was the only column with nulls (10 in train, 3 in test).
# Those rows were already removed by the dropna step in section 1.1b (i),
# so this check is expected to report zero for every column. No further
# imputation or dropping is required here.

# Checking for inconsistencies in the 'sentiment' column: every label must be
# one of the three expected classes.
VALID_SENTIMENTS = ['Positive', 'Neutral', 'Negative']
invalid_sentiments = train[~train['sentiment'].isin(VALID_SENTIMENTS)]
if not invalid_sentiments.empty:
    print(f"Invalid sentiments detected: {len(invalid_sentiments)}")
else:
    print("No invalid sentiments detected.")

# Limitations and potential biases:
# - Class imbalance: per the describe() output after cleaning, 'Positive' is
#   the label of 3684 of the 3932 train rows, so a model can score well by
#   mostly predicting the majority class.
# - Representation: 'brand' has a single unique value, so brands cannot be
#   compared; 'primaryCategories' is dominated by 'Electronics' (2561 of 3932
#   rows), so the other categories are under-represented.
# - Review lengths vary considerably (see the length-outlier check), which may
#   indicate that some reviews contain filler or repeated content.
# - NOTE(review): inconsistent formatting of review text and dates was
#   reportedly noted during cleaning, but no check in this notebook
#   demonstrates it -- confirm before relying on this.