# Imports

In [None]:
import os
import io
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Default font sizes for axis labels and tick labels on every figure.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Helper Functions

In [None]:
#Setup Figure Saving
def save_this(fig_name, tight_layout=True, fig_extension="png", resolution=300):
  """Save the current matplotlib figure under ./figs.

  Args:
    fig_name: File name without extension.
    tight_layout: When true, call plt.tight_layout() before saving.
    fig_extension: File extension, also passed to savefig as the format.
    resolution: DPI passed to savefig.
  """
  target_dir = os.path.join(".", "figs")
  target_file = os.path.join(target_dir, ".".join((fig_name, fig_extension)))
  # The output directory is created on demand.
  os.makedirs(target_dir, exist_ok=True)
  print("saving figure", fig_name)
  if tight_layout:
    plt.tight_layout()
  plt.savefig(target_file, format=fig_extension, dpi=resolution)

# For Model metrics

# Preprocessing functions

In [None]:
## Load data either from a file uploaded through the Colab dialog or from a path (e.g. on mounted Drive).
## For local development, pass a local file path.
def load_data_csv(file_name, upload=False):
  """Read a CSV file into a DataFrame.

  Args:
    file_name: Path of the CSV file. When ``upload`` is true and the code runs
      in Google Colab, this is instead the name of the file chosen in the
      upload dialog.
    upload: If true and running in Colab, open the upload dialog and read the
      uploaded bytes. Outside Colab a notice is printed and ``file_name`` is
      read as a path.

  Returns:
    The parsed pandas DataFrame.

  Raises:
    KeyError: If the uploaded files do not include ``file_name``.
  """
  # Default source is the path itself; only a successful Colab upload replaces
  # it, so the return statement always has something to read.
  source = file_name
  if upload:
    try:
      shell = get_ipython()
    except NameError:
      # Plain Python interpreter: the IPython builtin does not exist.
      shell = None
    if 'google.colab' in str(shell):
      from google.colab import files
      uploaded = files.upload()
      if file_name not in uploaded:
        raise KeyError(
            "'{}' was not among the uploaded files: {}".format(file_name, list(uploaded)))
      source = io.BytesIO(uploaded[file_name])
    else:
      print('Running on Local machine: Specify file absolute path')

  return pd.read_csv(source)

def load_data_json(file_name, upload=False):
  """Read a JSON file into a DataFrame.

  Args:
    file_name: Path of the JSON file. When ``upload`` is true and the code
      runs in Google Colab, this is instead the name of the file chosen in the
      upload dialog.
    upload: If true and running in Colab, open the upload dialog and read the
      uploaded bytes. Outside Colab a notice is printed and ``file_name`` is
      read as a path.

  Returns:
    The parsed pandas DataFrame.

  Raises:
    KeyError: If the uploaded files do not include ``file_name``.
  """
  # Default source is the path itself; only a successful Colab upload replaces
  # it, so the return statement always has something to read.
  source = file_name
  if upload:
    try:
      shell = get_ipython()
    except NameError:
      # Plain Python interpreter: the IPython builtin does not exist.
      shell = None
    if 'google.colab' in str(shell):
      from google.colab import files
      uploaded = files.upload()
      if file_name not in uploaded:
        raise KeyError(
            "'{}' was not among the uploaded files: {}".format(file_name, list(uploaded)))
      source = io.BytesIO(uploaded[file_name])
    else:
      print('Running on Local machine: Specify file absolute path')

  return pd.read_json(source)

def drop_columns(data, columns):
  """Return ``data`` without the given columns.

  The input frame is left untouched, so a cell doing
  ``df = drop_columns(df, cols)`` does not silently alter other references to
  the original frame.

  Args:
    data: Source DataFrame.
    columns: Column label or list of labels to remove.

  Returns:
    A new DataFrame without ``columns``.

  Raises:
    KeyError: If any of ``columns`` is not present in ``data``.
  """
  return data.drop(columns, axis=1)

def cleanup_scrapping_data(data):
  """Reduce a scraped review frame to the three shared review columns.

  Drops 'price', 'title', 'likes' and 'dislikes', then renames
  'name' -> 'Product Name', 'review' -> 'Reviews' and 'rating' -> 'Rating'.
  The input frame is not modified.

  Args:
    data: Scraped reviews containing at least the four dropped columns.

  Returns:
    A new DataFrame with the dropped columns removed and the rest renamed.

  Raises:
    KeyError: If one of the columns to drop is missing.
  """
  scrape_only_columns = ['price', 'title', 'likes', 'dislikes']
  rename_map = {'name':'Product Name', 'review':'Reviews', 'rating': 'Rating'}

  # drop() and rename() both return new frames, so the caller's data survives.
  return data.drop(columns=scrape_only_columns).rename(columns=rename_map)

## Map a numeric rating column to a sentiment label column
def add_sentiment_target(data, ref_colm):
  """Add a 'Sentiment' column derived from the values in ``ref_colm``.

  Values 1 and 2 become 'NEGATIVE', 4 and 5 become 'POSITIVE', and every
  other value becomes 'NEUTRAL'. The column is written onto ``data`` itself.

  Args:
    data: DataFrame to label.
    ref_colm: Name of the column holding the ratings.

  Returns:
    The same DataFrame object with the new 'Sentiment' column.
  """
  def label_for(rating):
    if rating in (1, 2):
      return 'NEGATIVE'
    if rating in (4, 5):
      return 'POSITIVE'
    return 'NEUTRAL'

  data['Sentiment'] = data[ref_colm].apply(label_for)
  return data


## NLP Preprocessing

In [None]:
import re
import nltk.corpus
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus and keep the English stop words in a
# module-level list; remove_stop_word() below reads this name.
nltk.download('stopwords')
stop_words_list = stopwords.words('english')

# Alternatives are tried left to right at each position:
#   1. "@" followed by a run of [a-z0-9.,]   (mentions)
#   2. any single character outside [0-9a-z., space, tab]
#   3. scheme://non-space-run                (URLs)
#   4. "rt" at the very start of the text
#   5. "http" plus one following character
_NOISE_PATTERN = re.compile(
    r"(@[a-z0-9.,]+)|([^0-9a-z., \t])|(\w+:\/\/\S+)|^rt|http.+?"
)

def remove_punctuation(text):
    """Strip mentions, URLs and every character outside ``[0-9a-z., \\t]``.

    The mention alternative previously read ``@\\[a-z0-9.,]+``; the escaped
    bracket made it match a literal "@[" sequence, so "@name" only lost its
    "@" and the name stayed in the text. It now removes the whole mention.

    Periods and commas are kept. Upper-case letters are outside the kept set
    and are therefore deleted, so lower-case the text before calling this.

    Args:
        text: Lower-cased input string.

    Returns:
        ``text`` with all matches removed (nothing is inserted in their place).
    """
    return _NOISE_PATTERN.sub("", text)

def remove_stop_word(text):
    """Drop every whitespace-separated token found in ``stop_words_list``.

    The remaining tokens are re-joined with single spaces, so runs of
    whitespace in the input collapse as a side effect.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words_list:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def remove_newlines(text):
    """Return ``text`` with every newline character replaced by a space.

    Strings are immutable, so the result of ``str.replace`` has to be
    returned; the earlier version discarded it and handed back the input
    unchanged.
    """
    return text.replace("\n", " ")

# (pattern, replacement) pairs applied in order. Each pattern accepts both the
# ASCII apostrophe (') and the typographic one (’). The trailing \b and the
# (?<=\w) look-behind restrict matches to real word endings, so quoted words
# such as 'simple' or 'the' are left alone.
_CONTRACTION_RULES = (
    (re.compile(r"won['’]t\b"), "will not"),
    (re.compile(r"can['’]t\b"), "can not"),
    (re.compile(r"n['’]t\b"), " not"),
    (re.compile(r"(?<=\w)['’]re\b"), " are"),
    (re.compile(r"(?<=\w)['’]s\b"), " is"),
    (re.compile(r"(?<=\w)['’]ll\b"), " will"),
    (re.compile(r"(?<=\w)['’]t\b"), " not"),
    (re.compile(r"(?<=\w)['’]ve\b"), " have"),
    (re.compile(r"(?<=\w)['’]m\b"), " am"),
)

def contractions(text):
    """Expand common English contractions ("can't" -> "can not").

    Matching is case-sensitive and every "'s" is expanded to " is", including
    possessives. Apostrophes must still be present in ``text`` when this runs:
    remove_punctuation() deletes them.

    Args:
        text: Input string, using either ' or ’ as the apostrophe.

    Returns:
        ``text`` with the contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

def nlp_preprocessing(data):
    """Normalise the text in ``data['Reviews']``.

    Each review goes through, in order: HTML tag stripping, lower-casing,
    newline replacement, noise/punctuation removal and stop-word removal.
    HTML is stripped first because remove_punctuation() deletes the ``<``,
    ``>`` and ``/`` characters; once they are gone tags can no longer be
    recognised and their names would stay behind as words.

    Args:
        data: DataFrame with a string column named 'Reviews'.

    Returns:
        The same DataFrame object, with 'Reviews' overwritten.
    """
    def clean_review(review):
        # An explicit parser keeps the result independent of which optional
        # parsers are installed; the " " separator stops the words on either
        # side of a tag (e.g. <br>) from being glued together.
        review = BeautifulSoup(review, "html.parser").get_text(" ")
        review = review.lower()
        review = remove_newlines(review)
        review = remove_punctuation(review)
        return remove_stop_word(review)

    # One pass over the column instead of one pass per step.
    data['Reviews'] = data['Reviews'].apply(clean_review)
    return data

def data_cleanup(data):
    """Label, normalise and de-duplicate a review frame.

    Steps: drop rows without a review, cast 'Reviews' to str, add the
    'Sentiment' column from 'Rating', run nlp_preprocessing(), drop reviews
    that are empty after preprocessing, and drop duplicate review texts.

    The work is done on a copy; the frame passed in is not modified.

    Args:
        data: DataFrame with 'Reviews' and 'Rating' columns.

    Returns:
        A new, cleaned DataFrame.
    """
    # Missing reviews must go before the str cast: astype('str') would turn
    # NaN into the literal text 'nan', which dropna can no longer detect.
    reviews = data.dropna(subset=['Reviews']).copy()
    reviews['Reviews'] = reviews['Reviews'].astype('str')

    reviews = add_sentiment_target(reviews, 'Rating')
    reviews = nlp_preprocessing(reviews)

    ## drop reviews left empty (or whitespace-only) by preprocessing
    has_text = reviews['Reviews'].str.strip().astype(bool)
    reviews = reviews[has_text]

    ## drop duplicates
    return reviews.drop_duplicates(['Reviews'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data Load

## Load and cleanup columns

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load Data
# Load Data
# One Amazon CSV export and two Flipkart JSON files, all read from the
# Datasets/raw folder on the mounted drive.
amazon_mobile_reviews = load_data_csv('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/Amazon_Unlocked_Mobile.csv')
flipkar_mobile_reviews_1 = load_data_json('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/FK_203148.json')
flipkar_mobile_reviews_2 = load_data_json('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/FK_216754.json')

In [None]:
# Column names, dtypes and non-null counts of the two Flipkart frames.
flipkar_mobile_reviews_1.info()
flipkar_mobile_reviews_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203148 entries, 0 to 203147
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   name      203148 non-null  object
 1   price     203148 non-null  object
 2   title     203148 non-null  object
 3   review    203148 non-null  object
 4   rating    203148 non-null  int64 
 5   likes     203148 non-null  int64 
 6   dislikes  203148 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 10.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216754 entries, 0 to 216753
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   name      216754 non-null  object
 1   price     216754 non-null  object
 2   title     216754 non-null  object
 3   review    216754 non-null  object
 4   rating    216754 non-null  int64 
 5   likes     216754 non-null  int64 
 6   dislikes  216754 non-null  int64 
dtypes: int64(3), object(4)

In [None]:
# Preview of both Flipkart frames stacked row-wise; the result is only
# displayed here, not assigned to a name.
pd.concat([flipkar_mobile_reviews_1, flipkar_mobile_reviews_2], axis=0)

Unnamed: 0,name,price,title,review,rating,likes,dislikes
0,"SAMSUNG Galaxy M01 (Black, 32 GB)","₹7,790",Really Nice,I have recently gifted this cell to my Dad... ...,4,776,224
1,"SAMSUNG Galaxy M01 (Black, 32 GB)","₹7,790",Awesome,I am truly satisfied with the performance of t...,5,222,86
2,"SAMSUNG Galaxy M01 (Black, 32 GB)","₹7,790",Good quality product,I gifted it to my mom... It is good for normal...,4,45,7
3,"SAMSUNG Galaxy M01 (Black, 32 GB)","₹7,790",Awesome,Good phone extremely liked it good perfomance ...,5,52,9
4,"SAMSUNG Galaxy M01 (Black, 32 GB)","₹7,790",Does the job,Phone is good Only For Simple Purpose still it...,3,62,14
...,...,...,...,...,...,...,...
216749,"SAMSUNG Galaxy On Nxt (Black, 16 GB)","₹9,499",Good quality product,The mobile is good but battery still working ...,4,0,0
216750,"SAMSUNG Galaxy On Nxt (Black, 16 GB)","₹9,499",Terrific purchase,nice and smooth mobile value for money,5,0,0
216751,"SAMSUNG Galaxy On Nxt (Black, 16 GB)","₹9,499",Awesome,awesome,5,0,0
216752,"SAMSUNG Galaxy On Nxt (Black, 16 GB)","₹9,499",Value-for-money,Good,4,0,0


### Save flipkart data in CSV

In [None]:
# Write CSV copies of the two Flipkart frames into the same raw folder the
# JSON files were read from.
flipkar_mobile_reviews_1.to_csv('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/FK_203148.csv')
flipkar_mobile_reviews_2.to_csv('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/FK_216754.csv')

### Amazon Raw data- Remove duplicates

In [None]:
# Drop the brand, vote and price columns, then keep one row per distinct
# review text. Nothing is mutated in place, and errors='ignore' lets the cell
# be re-run after the columns are already gone (the earlier form raised
# KeyError on a second run).
amazon_mobile_reviews = (
    amazon_mobile_reviews
    .drop(columns=['Brand Name', 'Review Votes', 'Price'], errors='ignore')
    .drop_duplicates(['Reviews'])
)
amazon_mobile_reviews.to_csv('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/amazon_reviews_no_duplicates.csv')

In [None]:
# Schema and row count of the Amazon frame.
amazon_mobile_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Product Name  413840 non-null  object 
 1   Brand Name    348669 non-null  object 
 2   Price         407907 non-null  float64
 3   Rating        413840 non-null  int64  
 4   Reviews       413778 non-null  object 
 5   Review Votes  401544 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 18.9+ MB


### Flipkart Raw data - Remove duplicates

In [None]:
# Stack both Flipkart frames, reduce them to the shared review columns, then
# keep one row per distinct review text before writing the CSV.
flipkart_mobile_reviews = cleanup_scrapping_data(
    pd.concat([flipkar_mobile_reviews_1, flipkar_mobile_reviews_2], axis=0)
)
flipkart_mobile_reviews = flipkart_mobile_reviews.drop_duplicates(['Reviews'])
flipkart_mobile_reviews.to_csv('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/flipkart_reviews_no_duplicates.csv')

In [None]:
# Schema and row count of the de-duplicated Flipkart frame.
flipkart_mobile_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140437 entries, 0 to 216023
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Product Name  140437 non-null  object
 1   Reviews       140437 non-null  object
 2   Rating        140437 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.3+ MB


### Merge Amazon and Flipkart

In [None]:
# Stack the Flipkart and Amazon frames row-wise and save the result.
# NOTE(review): each source was de-duplicated on its own above; no
# drop_duplicates runs on the combined frame here, so a review text present in
# both sources would appear twice in this "no_duplicates" file - confirm
# whether that is intended.
all_mobile_reviews = pd.concat([flipkart_mobile_reviews, amazon_mobile_reviews], axis=0)
all_mobile_reviews.to_csv('/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/raw/all_mobiles_reviews_no_duplicates.csv')


### Preprocess Data

In [None]:
# Run the same cleanup (sentiment label, text preprocessing, de-duplication)
# on the combined frame and on each source separately.
all_mobile_reviews_processed = data_cleanup(all_mobile_reviews)
amazon_review_processed = data_cleanup(amazon_mobile_reviews)
flipkart_review_processed = data_cleanup(flipkart_mobile_reviews)

  ' Beautiful Soup.' % markup)


In [None]:
# Schema and row count of the processed combined frame.
all_mobile_reviews_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279743 entries, 0 to 413829
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Product Name  279743 non-null  object
 1   Reviews       279743 non-null  object
 2   Rating        279743 non-null  int64 
 3   Sentiment     279743 non-null  object
dtypes: int64(1), object(3)
memory usage: 10.7+ MB


In [None]:
# Schema and row count of the processed Amazon frame.
amazon_review_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155477 entries, 0 to 413829
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Product Name  155477 non-null  object
 1   Rating        155477 non-null  int64 
 2   Reviews       155477 non-null  object
 3   Sentiment     155477 non-null  object
dtypes: int64(1), object(3)
memory usage: 5.9+ MB


In [None]:
# Schema and row count of the processed Flipkart frame.
flipkart_review_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125853 entries, 0 to 216023
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Product Name  125853 non-null  object
 1   Reviews       125853 non-null  object
 2   Rating        125853 non-null  int64 
 3   Sentiment     125853 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.8+ MB


## Save Data

In [None]:
# Write each processed frame to its CSV in the Datasets/processed folder.
for processed_frame, csv_path in (
    (all_mobile_reviews_processed, '/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/processed/mobile_reviews_processed.csv'),
    (amazon_review_processed, '/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/processed/amazon_reviews_processed.csv'),
    (flipkart_review_processed, '/content/drive/Shareddrives/projects_data/Sentiment-Analysis/Datasets/processed/flipkar_reviews_processed.csv'),
):
    processed_frame.to_csv(csv_path)