# **Sentiment Analysis (Word2Vec)**





### **Review Data About DataScience**

In [56]:
corpus = [
    "Data science plays a pivotal role in extracting valuable insights from data.",
    "Machine learning algorithms are essential tools in data science.",
    "Data scientists are in high demand due to their expertise in analyzing data.",
    "Predictive analytics is a key component of data science.",
    "Data science helps businesses make data-driven decisions.",
    "Natural language processing is an exciting field within data science.",
    "Data visualization is crucial for conveying complex information.",
    "Big data technologies are transforming the way we handle information.",
    "Data cleaning is a fundamental step in the data science process.",
    "Data mining uncovers hidden patterns in large datasets."
]

corpus

['Data science plays a pivotal role in extracting valuable insights from data.',
 'Machine learning algorithms are essential tools in data science.',
 'Data scientists are in high demand due to their expertise in analyzing data.',
 'Predictive analytics is a key component of data science.',
 'Data science helps businesses make data-driven decisions.',
 'Natural language processing is an exciting field within data science.',
 'Data visualization is crucial for conveying complex information.',
 'Big data technologies are transforming the way we handle information.',
 'Data cleaning is a fundamental step in the data science process.',
 'Data mining uncovers hidden patterns in large datasets.']

### **`Preprocessing`**

### **Lowercase**

In [57]:
lowercase_corpus = []
for i in corpus:
   lowercase_corpus.append(i.lower())

lowercase_corpus

['data science plays a pivotal role in extracting valuable insights from data.',
 'machine learning algorithms are essential tools in data science.',
 'data scientists are in high demand due to their expertise in analyzing data.',
 'predictive analytics is a key component of data science.',
 'data science helps businesses make data-driven decisions.',
 'natural language processing is an exciting field within data science.',
 'data visualization is crucial for conveying complex information.',
 'big data technologies are transforming the way we handle information.',
 'data cleaning is a fundamental step in the data science process.',
 'data mining uncovers hidden patterns in large datasets.']

In [58]:
import pandas as pd
review = pd.DataFrame(lowercase_corpus,columns=['review'])

review

Unnamed: 0,review
0,data science plays a pivotal role in extractin...
1,machine learning algorithms are essential tool...
2,data scientists are in high demand due to thei...
3,predictive analytics is a key component of dat...
4,data science helps businesses make data-driven...
5,natural language processing is an exciting fie...
6,data visualization is crucial for conveying co...
7,big data technologies are transforming the way...
8,data cleaning is a fundamental step in the dat...
9,data mining uncovers hidden patterns in large ...


### **Tokenization**

In [32]:
for i in review['review'].index:
   x = review.loc[i,'review'].split()
   review.loc[i,'review'] = x

In [33]:
review.loc[1,'review']

['machine',
 'learning',
 'algorithms',
 'are',
 'essential',
 'tools',
 'in',
 'data',
 'science']

### **RE**

In [29]:
import re
for i in review['review'].index:
   x = re.sub('[^a-zA-Z]',' ',review['review'][i])

   review['review'][i] = x


In [31]:
review.loc[1,'review']

'machine learning algorithms are essential tools in data science '

### **Stopwords Removal**

In [37]:
import nltk
from nltk.corpus import stopwords

In [39]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stopword = stopwords.words('english')

stopword

In [41]:
review

Unnamed: 0,review
0,"[data, science, plays, a, pivotal, role, in, e..."
1,"[machine, learning, algorithms, are, essential..."
2,"[data, scientists, are, in, high, demand, due,..."
3,"[predictive, analytics, is, a, key, component,..."
4,"[data, science, helps, businesses, make, data,..."
5,"[natural, language, processing, is, an, exciti..."
6,"[data, visualization, is, crucial, for, convey..."
7,"[big, data, technologies, are, transforming, t..."
8,"[data, cleaning, is, a, fundamental, step, in,..."
9,"[data, mining, uncovers, hidden, patterns, in,..."


In [42]:
for i in review['review'].index:
  x = [i for i in review.loc[i,'review'] if i not in stopword ]
  review.loc[i,'review'] = x

In [43]:
review

Unnamed: 0,review
0,"[data, science, plays, pivotal, role, extracti..."
1,"[machine, learning, algorithms, essential, too..."
2,"[data, scientists, high, demand, due, expertis..."
3,"[predictive, analytics, key, component, data, ..."
4,"[data, science, helps, businesses, make, data,..."
5,"[natural, language, processing, exciting, fiel..."
6,"[data, visualization, crucial, conveying, comp..."
7,"[big, data, technologies, transforming, way, h..."
8,"[data, cleaning, fundamental, step, data, scie..."
9,"[data, mining, uncovers, hidden, patterns, lar..."


### **Lemmatization**

In [44]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

In [46]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [49]:
for i in review['review'].index:
   x = [lemma.lemmatize(i) for i in review['review'][i] ]
   review['review'][i] = ' '.join(x)


In [52]:
pure_doc = review

In [68]:
pure_doc

Unnamed: 0,review
0,data science play pivotal role extracting valu...
1,machine learning algorithm essential tool data...
2,data scientist high demand due expertise analy...
3,predictive analytics key component data science
4,data science help business make data driven de...
5,natural language processing exciting field wit...
6,data visualization crucial conveying complex i...
7,big data technology transforming way handle in...
8,data cleaning fundamental step data science pr...
9,data mining uncovers hidden pattern large data...


In [59]:
pure_doc['review'][0] , review['review'][0]

('data science play pivotal role extracting valuable insight data',
 'data science plays a pivotal role in extracting valuable insights from data.')

### **Sentiment Score**


In [60]:
from textblob import TextBlob

In [71]:
scores = []

In [72]:
for i in pure_doc['review']:
   x =  TextBlob(i)
   score = x.sentiment
   scores.append(score.polarity)

In [73]:
pure_doc['sentiment score'] = scores

In [75]:
pure_doc

Unnamed: 0,review,sentiment score
0,data science play pivotal role extracting valu...,0.5
1,machine learning algorithm essential tool data...,0.0
2,data scientist high demand due expertise analy...,0.0175
3,predictive analytics key component data science,0.0
4,data science help business make data driven de...,0.0
5,natural language processing exciting field wit...,0.2
6,data visualization crucial conveying complex i...,-0.15
7,big data technology transforming way handle in...,0.0
8,data cleaning fundamental step data science pr...,0.0
9,data mining uncovers hidden pattern large data...,0.02381


In [76]:
pure_doc['label'] = pure_doc['sentiment score'].apply(lambda x : "positive" if x>0.5 else( 'negative' if x<0.0 else "neutral") )

In [77]:
pure_doc

Unnamed: 0,review,sentiment score,label
0,data science play pivotal role extracting valu...,0.5,neutral
1,machine learning algorithm essential tool data...,0.0,neutral
2,data scientist high demand due expertise analy...,0.0175,neutral
3,predictive analytics key component data science,0.0,neutral
4,data science help business make data driven de...,0.0,neutral
5,natural language processing exciting field wit...,0.2,neutral
6,data visualization crucial conveying complex i...,-0.15,negative
7,big data technology transforming way handle in...,0.0,neutral
8,data cleaning fundamental step data science pr...,0.0,neutral
9,data mining uncovers hidden pattern large data...,0.02381,neutral


### **`Text to Vector`**


In [None]:
token = [ ]
for i in pure_doc['review']:
  x = i.split()
  token.append(x)

token

In [None]:
token

In [106]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=token,min_count=1,vector_size=9)

In [None]:
words = model.wv.key_to_index

vector = [model.wv[i] for i in words]

vector


In [108]:
df = pd.DataFrame(vector)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.005904,0.002706,0.057113,0.10017,-0.103296,-0.078852,0.071386,0.099604,-0.055552
1,-0.04183,0.082008,-0.016866,-0.050398,0.072868,-0.053996,-0.020342,0.031943,0.011165
2,-0.092115,-0.105027,0.081258,0.056298,0.075083,0.008562,0.070563,-0.037815,-0.0105
3,0.064078,-0.083562,-0.043714,-0.083478,-0.010336,0.106009,-0.081332,-0.025925,-0.021519
4,0.089825,-0.06584,0.000658,-0.052796,-0.106611,0.055772,-0.097413,-0.048886,-0.0003
5,-0.003271,-0.085147,0.106874,0.055266,0.102556,-0.090582,0.049886,-0.046006,0.009205
6,0.094475,-0.049584,0.050248,-0.075353,-0.039458,0.104493,-0.01756,0.003612,-0.045952
7,-0.085386,-0.016662,0.027492,-0.009886,0.061497,-0.03047,0.025104,0.060589,0.092769
8,-0.016144,-0.102294,0.048606,0.006382,0.082717,-0.008991,-0.029331,-0.097273,-0.009486
9,0.031375,0.059958,0.078501,-0.063298,0.020619,0.067834,-0.053373,-0.034496,0.075614
