# Importing the necessary packages

In [None]:
import pandas as pd
import numpy as np
import re
import os
!pip install spacy
import spacy
from tqdm import tqdm
import en_core_web_sm
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('all')
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
!pip install sentic
from sentic import SenticPhrase

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

# Historic dataset

In [None]:
hist= pd.read_excel('/content/drive/MyDrive/NLP/Case Study/Dataset/FORD_MUSTANG.xlsx')

hist.dropna(inplace=True)

def extract_relevant_model(df,model_type):
  # This function will extract the relevant brand and model type from the historic review dataset
  # df ----> dataframe
  # model_type ---> Model type to be extracted

  df["Flag"]= df["Vehicle_Title"].str.find(model_type)
  df1 = df.drop(df[df.Flag == -1].index)
  df1.drop(columns=['Flag'],inplace=True)
  return df1

hist1=extract_relevant_model(hist,'Convertible')

def date_extraction(df):
  #This function will extract the date from the Review_Date column
  # df ---> The dataframe from which we will separate extract date the Review_Date column
  
  df['Date'] = df['Review_Date'].str.extract(r"(\d\d/\d\d/\d\d \d\d:\d\d)", expand=True)
  df['Date']=pd.to_datetime(df['Date'])#format="%m/%d/%y")
  df.drop(columns='Review_Date',inplace=True)
  return df

hist2=date_extraction(hist1)

def hist_filtering(df,manufacturer):
  # This function is used to filter the historic data
  df.drop(columns=['Author_Name', 'Vehicle_Title', 'Review_Title','Rating'],inplace=True)
  df['manufacturer']=manufacturer
  return df

hist_b=hist2.copy()
hist3=hist_filtering(hist_b,'Ford')
historic=hist3.copy()
historic['Source'] = 'hist'

# Tweets

In [None]:
tweets=pd.read_csv('/content/drive/MyDrive/NLP/Case Study/Dataset/Ford-Mustang-Convertible-model.csv')

def filtering_tweets(df):
  # This function will take the tweets dataset and filter it accordingly
  # df --> dataframe
  df['Date'] = df['date'].str.extract(r"(\d\d\d\d-\d\d-\d\d \d\d:\d\d)", expand=True)
  df['Date']=pd.to_datetime(df['Date'])
  df.drop(columns=['date', 'username'],inplace=True)
  df.rename(columns={"content":"Review"},inplace=True)
  df1=df[['Review','Date','manufacturer']]
  return df1

twt=filtering_tweets(tweets)
twt['Source'] = 'twts'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# YouTube Comments

In [None]:
comments=pd.read_csv('/content/drive/MyDrive/NLP/Case Study/Dataset/Ford-Mustang-Convertible-YouTube.csv')

def filtering_comments(df):
  # This function will take the tweets dataset and filter it accordingly
  # df --> dataframe
  df['Date'] = df['DateTime'].str.extract(r"(\d\d\d\d-\d\d-\d\dT\d\d:\d\d)", expand=True)
  df['Date']=pd.to_datetime(df['Date'])
  df.drop(columns=['DateTime'],inplace=True)
  df.rename(columns={"Comment":"Review"},inplace=True)
  df1=df[['Review','Date']]
  return df1

cmm=filtering_comments(comments)
cmm['manufacturer'] = 'Ford'
cmm['Source'] = 'yout'

# Combining the historic dataset with the tweets and YouTube Comments

In [None]:
final_data = pd.concat([historic,twt,cmm]).reset_index(0,drop=True)
final_data.sort_values(by = 'Date', axis = 0, inplace = True)
final_data.reset_index(drop = True, inplace = True)
historic.shape,twt.shape,cmm.shape,final_data.shape

((678, 4), (2248, 4), (264, 4), (3190, 4))

# Cleaning the data

In [None]:
# Removing URLs
def remove_URL(headline_text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', headline_text)


# Removing the html tags
def remove_html(headline_text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',headline_text)

# Removing Pictures/Tags/Symbols/Emojis
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

import string

# Removing punctuations
def remove_punct(headline_text):
    table=str.maketrans('','',string.punctuation)
    return headline_text.translate(table)

final_data['Review']=final_data['Review'].apply(remove_URL)
final_data['Review'] = final_data['Review'].apply(remove_html)
final_data['Review']=final_data['Review'].apply(remove_emojis)
final_data['Review']=final_data['Review'].apply(remove_punct)


# Text Summariztion

In [None]:
sid = SentimentIntensityAnalyzer()
text = " ".join(review for review in final_data.Review)

## Score without summarization

In [20]:
x=sid.polarity_scores(text)
print('Compund \t -',x.get("compound"))
print('Negative \t -',x.get("neg"))
print('Neutral \t -',x.get("neu"))
print('Positive \t -',x.get("pos"))

Compund 	 - 1.0
Negative 	 - 0.037
Neutral 	 - 0.777
Positive 	 - 0.185


## sumy

In [None]:
pip install sumy

In [18]:
# Load Packages
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

# Creating text parser using tokenization
parser = PlaintextParser.from_string(text,Tokenizer("english"))

from sumy.summarizers.text_rank import TextRankSummarizer

# Summarize using sumy TextRank
summarizer = TextRankSummarizer()
summary =summarizer(parser.document,2)

text_summary=""
for sentence in summary:
    text_summary+=str(sentence)

print(text_summary)



In [19]:
x=sid.polarity_scores(text_summary)
print('Compund \t -',x.get("compound"))
print('Negative \t -',x.get("neg"))
print('Neutral \t -',x.get("neu"))
print('Positive \t -',x.get("pos"))

Compund 	 - 1.0
Negative 	 - 0.05
Neutral 	 - 0.74
Positive 	 - 0.21


## Hugging Face 

In [None]:
#!pip install transformers
from transformers import pipeline

# Initialize the HuggingFace summarization pipeline
summarizer = pipeline("summarization")
summarized = summarizer(text)

# Print summarized text
print(summarized)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)
Token indices sequence length is longer than the specified maximum sequence length for this model (149582 > 1024). Running this sequence through the model will result in indexing errors


IndexError: ignored

The hugging face is unable to process huge amount of data

## Gensim

In [None]:
pip install gensim

In [None]:
from gensim.summarization.summarizer import summarize

# Summarize text using gensim
gen_summary=summarize(text)
print(gen_summary)

We  2004 Ford Mustang convertible 2004 2004 Ford Mustang convertible condition excellentcylinders 6…   well he said that the rear seats will be fine for me cause m a kid butt wait i 5ft 8inn no way 1971 FORD MUSTANG QR Code Link to This Post 1971 FORD MUSTANG CONVERTIBLE MACH 1 HOOD amp STRIPES amp…   Loving the fact bbceastenders featured a red Mustang Convertible for the Mitchell sisters finale Great choice Mustang Ford Its cold outside  Dreaming of summer in a convertible Ford Mustang  HotWheels Red 65 Mustang Convertible 1982 NIP vintage car ford collectible   1971 Ford Mustang Convertible No Reserve 1971 For
Why do the British models not have Sync 3 yet This week Ive been driving the Ford Mustang EcoBoost €61500 for the convertible  1996 Ford Mustang GT Convertible 46L Hot Rod QR Code Link to This Post This car I am selling is a 1996…   1972 Ford Mustang Convertible 351 V8 4speed Docum
Mustang convertible spotted in Sydney today First time Ive seen one Ford Classic  I39ve just dri

In [None]:
x=sid.polarity_scores(gen_summary)
print('Compund \t -',x.get("compound"))
print('Negative \t -',x.get("neg"))
print('Neutral \t -',x.get("neu"))
print('Positive \t -',x.get("pos"))

Compund 	 - 1.0
Negative 	 - 0.041
Neutral 	 - 0.766
Positive 	 - 0.193


# <h2> <center><U> INFERENCE </U></CENTER></H2>

1. Loading the necessary packages
2. Loading and processing the historic,tweets and youtube comments.
3. Merging 3 different datasets
4. Cleaning the data by removing the stopwords,punctuations, emojis.
5. Extracting the reviews into one variable for text summarization. 
6. Firstly, I have directly got the sentiment score without summarization to perform comparative study. 
7. Performed text summarization using sumy,hugging face and gensim. The observations is as follows :- 

The Gensim is taking 46 seconds to summarize the data.The sentiment score for this summarized data was processed in 7mins. Whereas the Hugging face is giving <b>index out of range in self.</b> Implies the Gensim text summarization is working better than the Hugging Face. The summy took 1min to summarize the text. In 3mins I have got the sentiment score. 

## Comparitive Study

1. <h3>Sentiment Score without summarization</h3>
          
          Compund 	 - 1.0
          Negative 	 - 0.037
          Neutral 	 - 0.777
          Positive 	 - 0.185

2. Sentiment Score with sumy (text summarization)

          Compund 	 - 1.0
          Negative 	 - 0.05
          Neutral 	 - 0.74
          Positive 	 - 0.21

3. Sentiment Score with Gensim (text summarization)

          Compund 	 - 1.0
          Negative 	 - 0.041
          Neutral 	 - 0.766
          Positive 	 - 0.193


We can observe that even after text summarization, we are getting similar sentiment scores. But to be very precise, Gensim is giving the sentiment score closer to the sentiment score before text summarization. So, I conclude by telling that, for this specific scenario, Gensim is the best text summarizer compare to sumy and hugging face. 