In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


In [2]:
data = pd.read_csv('../artifacts/sentiment_analysis.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [None]:
## Data Preprocessing

In [4]:
data.shape

(7920, 3)

In [6]:
data.duplicated().sum()

np.int64(0)

In [7]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [None]:
## Text Preprocessing

In [8]:
import re
import string

Convert uppercase to lowercase

In [10]:
# Lowercase every whitespace-separated token and re-join the tokens with single spaces.
data["tweet"] = data["tweet"].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)

In [20]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    i am completely in love with the new iphone em...
7912    tune in turn on drop out  gtd in one app  mobi...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

Remove links

In [13]:
def _strip_links(text):
    """Delete every match of the link pattern from `text`.

    A match starts at "http" or "www" and runs up to the next whitespace
    character (or the end of the string).
    """
    return re.sub(r"http\S+|www\S+|https\S+", "", text)


data["tweet"] = data["tweet"].apply(_strip_links)


In [15]:
# Build the deletion table once instead of once per row: it maps every
# character of string.punctuation (ASCII punctuation) to None.
# NOTE(review): non-ASCII punctuation such as "…" is not in string.punctuation
# and therefore stays in the text.
punctuation_table = str.maketrans("", "", string.punctuation)
data["tweet"] = data["tweet"].apply(lambda text: text.translate(punctuation_table))


In [19]:
# Remove every run of digits from each tweet.
digit_run = re.compile(r"\d+")
data["tweet"] = data["tweet"].apply(lambda text: digit_run.sub("", text))


In [24]:
!pip install nltk



In [25]:
import nltk
from nltk.corpus import stopwords


In [26]:
# Download the NLTK "stopwords" corpus into ../static/model (path is relative
# to the notebook's working directory).
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [28]:
# Add the download directory to NLTK's data search path so the corpus stored
# in ../static/model can be located.
nltk.data.path.append('../static/model')



In [29]:
from nltk.corpus import stopwords

# English stopwords collected into a set for membership tests during filtering.
stop_words = {*stopwords.words('english')}


In [30]:
def _drop_stopwords(text):
    """Re-join the whitespace-separated words of `text` whose lowercase form is not in `stop_words`."""
    kept = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(kept)


data["tweet"] = data["tweet"].apply(_drop_stopwords)

print(data.head())


   id  label                                              tweet
0   1      0  fingerprint pregnancy test android apps beauti...
1   2      0  finally transparant silicon case thanks uncle ...
2   3      0  love would go talk makememories unplug relax i...
3   4      0  im wired know im george made way iphone cute d...
4   5      1  amazing service apple wont even talk question ...


In [31]:
from nltk.stem import PorterStemmer


In [32]:
stemmer = PorterStemmer()


In [33]:
def _stem_words(text):
    """Stem each whitespace-separated word of `text` with `stemmer` and re-join with single spaces."""
    return " ".join(map(stemmer.stem, text.split()))


data["tweet"] = data["tweet"].apply(_stem_words)

print(data)


        id  label                                              tweet
0        1      0  fingerprint pregnanc test android app beauti c...
1        2      0  final transpar silicon case thank uncl yay son...
2        3      0  love would go talk makememori unplug relax iph...
3        4      0  im wire know im georg made way iphon cute dave...
4        5      1  amaz servic appl wont even talk question unles...
...    ...    ...                                                ...
7915  7916      0  live loud lol liveoutloud selfi smile soni mus...
7916  7917      0  would like wish amaz day make everi minut coun...
7917  7918      0  help love year old neighbor ipad morn made rea...
7918  7919      0  final got smart pocket wifi stay connect anyti...
7919  7920      0  appl barcelona appl store bcn barcelona travel...

[7920 rows x 3 columns]


In [34]:
print(data)

        id  label                                              tweet
0        1      0  fingerprint pregnanc test android app beauti c...
1        2      0  final transpar silicon case thank uncl yay son...
2        3      0  love would go talk makememori unplug relax iph...
3        4      0  im wire know im georg made way iphon cute dave...
4        5      1  amaz servic appl wont even talk question unles...
...    ...    ...                                                ...
7915  7916      0  live loud lol liveoutloud selfi smile soni mus...
7916  7917      0  would like wish amaz day make everi minut coun...
7917  7918      0  help love year old neighbor ipad morn made rea...
7918  7919      0  final got smart pocket wifi stay connect anyti...
7919  7920      0  appl barcelona appl store bcn barcelona travel...

[7920 rows x 3 columns]


In [36]:
from collections import Counter
volab = Counter()

In [37]:
volab

Counter()

In [40]:
data['tweet']

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       im wire know im georg made way iphon cute dave...
4       amaz servic appl wont even talk question unles...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [41]:
# Rebuild the counter from scratch so that re-running this cell does not add
# the same tweets a second time and inflate every word frequency.
volab = Counter(word for sentence in data['tweet'] for word in sentence.split())

In [47]:
len(volab)

15896

In [43]:
# Keep the words counted more than 10 times, in the counter's iteration order.
tokens = [word for word, count in volab.items() if count > 10]

In [48]:
tokens

['test',
 'android',
 'app',
 'beauti',
 'cute',
 'health',
 'iger',
 'iphoneonli',
 'iphonesia',
 'iphon',
 'final',
 'case',
 'thank',
 'yay',
 'soni',
 'xperia',
 'love',
 'would',
 'go',
 'talk',
 'relax',
 'smartphon',
 'wifi',
 'connect',
 'im',
 'wire',
 'know',
 'made',
 'way',
 'home',
 'amaz',
 'servic',
 'appl',
 'wont',
 'even',
 'question',
 'pay',
 'stupid',
 'support',
 'softwar',
 'updat',
 'fuck',
 'phone',
 'big',
 'time',
 'happi',
 'us',
 'instap',
 'instadaili',
 'xperiaz',
 'new',
 'type',
 'c',
 'charger',
 'cabl',
 'uk',
 '…',
 'amazon',
 'etsi',
 'year',
 'cross',
 'young',
 'newyear',
 'start',
 'recip',
 'technolog',
 'samsunggalaxi',
 'iphonex',
 'shop',
 'listen',
 'music',
 'likeforlik',
 'photo',
 'fun',
 'selfi',
 'pool',
 'water',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'outdoor',
 'hey',
 'make',
 'ipod',
 'dont',
 'color',
 'inch',
 'crash',
 'everi',
 'five',
 'ha',
 'need',
 'realli',
 'drop',
 'ball',
 'design',
 'give',
 'anoth',

In [49]:
def save_vocabulary(lines, filename):
    """Write the vocabulary to a UTF-8 text file, one entry per line.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.

    Entries are joined with a newline character; no trailing newline is
    written after the last entry.
    """
    text = '\n'.join(lines)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w', encoding="utf-8") as vocab_file:
        vocab_file.write(text)
save_vocabulary(tokens, '../static/model/vocabulary.txt')
    

In [50]:
# Model input is the processed tweet text; the target is the label column.
x, y = data['tweet'], data['label']

In [51]:
y

0       0
1       0
2       0
3       0
4       1
       ..
7915    0
7916    0
7917    0
7918    0
7919    0
Name: label, Length: 7920, dtype: int64

In [52]:
x

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       im wire know im georg made way iphon cute dave...
4       amaz servic appl wont even talk question unles...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [None]:
!pip install scikit-learn

In [75]:
# NOTE(review): y_train is only assigned by the train_test_split cell that
# follows, so this cell raises NameError on Restart & Run All unless it is
# executed after that cell.
y_train.shape

(6336,)

In [72]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the same rows land in train/test on every run;
# without random_state each execution produces a different 80/20 split.
# NOTE(review): consider stratify=y if the label distribution is imbalanced —
# not verified here.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the TF-IDF vocabulary at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training tweets only, then reuse the fitted vectorizer on the
# test tweets.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

for caption, matrix in (
    ("x_train_tfidf:", x_train_tfidf),
    ("x_test_tfidf:", x_test_tfidf),
):
    print(caption, matrix.shape)


x_train_tfidf: (6336, 5000)
x_test_tfidf: (1584, 5000)
