In [1]:
import pandas as pd
import numpy as np
import re 
import nltk

## Loading a dataset

In [2]:
# Only the first 10,001 rows are used, so let the parser stop there instead of
# loading the whole file and slicing afterwards. This selects the same rows as
# the earlier `.loc[0:10000]`, because label slicing is end-inclusive on the
# default RangeIndex.
twitter = pd.read_csv(
    'twitter-sentiment-analysis/train.csv',
    encoding='latin-1',
    nrows=10001,
)
twitter.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...
5,6,0,or i just worry too much?
6,7,1,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,8,0,Sunny Again Work Tomorrow :-| ...
8,9,1,handed in my uniform today . i miss you ...
9,10,1,hmmmm.... i wonder how she my number @-)


In [3]:
# Print the frame's index range, column dtypes, non-null counts and memory usage.
twitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 3 columns):
ItemID           10001 non-null int64
Sentiment        10001 non-null int64
SentimentText    10001 non-null object
dtypes: int64(2), object(1)
memory usage: 234.5+ KB


## Text cleaning
- Convert the text to lowercase
- Replace every run of characters other than the letters a-z (digits, punctuation, symbols) with a single space

In [4]:
def textprocess(text):
    """Normalise a tweet to lowercase ASCII letters separated by single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        `text` lower-cased, with every run of characters outside ``a-z``
        (digits, punctuation, whitespace, accented letters, ...) collapsed to
        one space, and with no leading or trailing space.
    """
    text = text.lower()
    # Anything that is not a lowercase ASCII letter becomes a word separator.
    text = re.sub(r'[^a-z]+', ' ', text)
    # Punctuation at either end of the tweet would otherwise leave a stray space.
    return text.strip()

In [5]:
# Clean every tweet, then store the result back in the same column.
cleaned_text = twitter['SentimentText'].apply(textprocess)
twitter['SentimentText'] = cleaned_text
twitter.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend
1,2,0,i missed the new moon trailer
2,3,1,omg its already o
3,4,0,omgaga im sooo im gunna cry i ve been at this...
4,5,0,i think mi bf is cheating on me t t
5,6,0,or i just worry too much
6,7,1,juuuuuuuuuuuuuuuuussssst chillin
7,8,0,sunny again work tomorrow tv tonight
8,9,1,handed in my uniform today i miss you already
9,10,1,hmmmm i wonder how she my number


## Lemmatization
- Reduce each word to its root (dictionary) form, guided by its part-of-speech tag

In [6]:
# To take a root word
from nltk.stem import wordnet
# POS is imported for PARTS OF SPEECH
from nltk import pos_tag

In [7]:
# Shared WordNet lemmatizer instance; `lemmatizer()` looks it up as the global `lemma`.
lemma = wordnet.WordNetLemmatizer()

In [8]:
# WordNetLemmatizer reads the WordNet corpus when it first lemmatizes a word,
# so fetch that corpus as well as the tagger model used by pos_tag. Without it
# a fresh environment fails with a LookupError.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
def lemmatizer(postag):
    """Return the lemma (base or dictionary form) of one POS-tagged token.

    Lemmatization removes inflectional endings only. `postag` is a
    ``(word, tag)`` pair; the first letter of the tag selects the WordNet part
    of speech: ``R`` -> adverb, ``V`` -> verb, ``J`` -> adjective. Every other
    tag is treated as a noun.
    """
    token, tag = postag
    wordnet_pos = {'R': 'r', 'V': 'v', 'J': 'a'}.get(tag[:1], 'n')
    return lemma.lemmatize(token, pos=wordnet_pos)

In [10]:
def joinlemma(sent):
    """Lemmatize each whitespace-separated token of `sent` and re-join with single spaces."""
    tagged_tokens = pos_tag(sent.split())
    lemmas = map(lemmatizer, tagged_tokens)
    return " ".join(lemmas)

In [11]:
# Lemmatize the cleaned tweets into a new 'lemm' column, keeping the cleaned text alongside.
lemmatized_text = twitter['SentimentText'].apply(joinlemma)
twitter['lemm'] = lemmatized_text
twitter.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText,lemm
0,1,0,is so sad for my apl friend,be so sad for my apl friend
1,2,0,i missed the new moon trailer,i miss the new moon trailer
2,3,1,omg its already o,omg it already o
3,4,0,omgaga im sooo im gunna cry i ve been at this...,omgaga im sooo im gunna cry i ve be at this de...
4,5,0,i think mi bf is cheating on me t t,i think mi bf be cheat on me t t
5,6,0,or i just worry too much,or i just worry too much
6,7,1,juuuuuuuuuuuuuuuuussssst chillin,juuuuuuuuuuuuuuuuussssst chillin
7,8,0,sunny again work tomorrow tv tonight,sunny again work tomorrow tv tonight
8,9,1,handed in my uniform today i miss you already,hand in my uniform today i miss you already
9,10,1,hmmmm i wonder how she my number,hmmmm i wonder how she my number


## Feature Extraction

## TFIDF

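$\mathrm{tfidf} = \log(\mathrm{TF} + 1) \cdot \log\left(\frac{N}{\mathrm{DF} + 1}\right)$

- TF = term frequency: the number of times the term occurs in a document
- DF = document frequency: the number of documents that contain the term
- N = total number of documents in the corpus
- term = word

Note: `TfidfVectorizer` with its default settings uses a smoothed variant, $\mathrm{TF} \cdot \left(\ln\frac{1 + N}{1 + \mathrm{DF}} + 1\right)$, and then L2-normalises each row.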

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# TF-IDF vectorizer created with its default settings.
tfidf = TfidfVectorizer()
# Disabled alternative: plain term counts.
#cv = CountVectorizer()

In [17]:
# Learn the vocabulary from the lemmatized tweets and build the TF-IDF matrix.
# NOTE(review): .toarray() expands the sparse result into a dense array with
# one column per vocabulary term; this can need a lot of memory — confirm it fits.
X = tfidf.fit_transform(twitter['lemm']).toarray()
# Disabled alternative: plain term counts on the cleaned (non-lemmatized) text.
#X = cv.fit_transform(twitter['SentimentText']).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# One row per tweet, one column per vocabulary term.
tweepy = pd.DataFrame(X, columns=feature_names)

In [22]:
# Persist the feature matrix as CSV (without the index column) to the file 'Twitterdata'.
tweepy.to_csv('Twitterdata',index=False)

In [23]:
import networkx as nx

In [24]:
!pip install mlxtend

Collecting mlxtend
  Downloading https://files.pythonhosted.org/packages/c0/ca/54fe0ae783ce81a467710d1c5fb41cfca075121139b48327b807020dc40c/mlxtend-0.16.0-py2.py3-none-any.whl (1.3MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.16.0


In [26]:
# Reload the persisted feature matrix and preview the first rows, as the other
# cells do, rather than asking the notebook to render the entire frame.
tweepydata = pd.read_csv('Twitterdata')
tweepydata.head(10)

Unnamed: 0,aa,aaaaaaah,aaaaoooowwwww,aaaarrgghhhhhhhh,aaah,aaahh,aaahhhhh,aaahhhhhh,aaarrrgggghhh,aah,...,zumba,zune,zxcv,zyzl,zzj,zzsp,zzzz,zzzzzs,zzzzzzzzzzzz,zzzzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
