In [3]:
import os

import nltk
import numpy as np
import pandas as pd

In [4]:
# Load the data.
# The CSV has no header row, so pandas is told not to consume the first line as
# one; the six column names are assigned by hand right after the read.
# The location defaults to the original author's machine. Set the
# NLP_TWEETS_CSV environment variable to run the notebook elsewhere without
# editing this cell.
TWEETS_CSV = os.environ.get('NLP_TWEETS_CSV', '/home/brian/Documents/nlp_tweets.csv')
df = pd.read_csv(TWEETS_CSV, encoding='ISO-8859-1', header=None)
# NOTE(review): this six-column layout resembles the Sentiment140 dump, in which
# case 'IDK' would be the polarity label and 'Query?' the query flag -- confirm
# against the data source before renaming anything.
df.columns = ['IDK', 'tweetId', 'Datetime', 'Query?', 'Username', 'tweet']


In [None]:
# Preview the first rows to confirm the file parsed and the column names line up.
df.head()

In [6]:
# Pick one tweet (row label 4) to use as a worked example in the cells below.
sample = df.loc[4, 'tweet']
sample

"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "

In [9]:
# Tokenize

from nltk import word_tokenize, FreqDist

def tokenz(text, n):
    """Return the ``n`` most frequent tokens in ``text`` with their counts.

    The text is split with ``word_tokenize`` and tallied with ``FreqDist``.
    Tokens are neither lower-cased nor filtered, so punctuation and case
    variants are counted as they come.

    Returns a list of ``(token, count)`` pairs as produced by
    ``FreqDist.most_common(n)``.
    """
    return FreqDist(word_tokenize(text)).most_common(n)

# Demo: the ten most common tokens of the tweet at row label 4.
tokenz(df['tweet'][4], n=10)


[('.', 3),
 ('all', 2),
 ('i', 2),
 ('@', 1),
 ('nationwideclass', 1),
 ('no', 1),
 (',', 1),
 ('it', 1),
 ("'s", 1),
 ('not', 1)]

In [8]:
#--------REMOVE STOPWORDS---------------
from nltk.corpus import stopwords

def stopword_remover(str):
    """Tokenize ``str`` and drop English stopwords.

    Parameters
    ----------
    str : str
        Raw text to clean. The parameter name shadows the built-in ``str``
        inside this function; it is kept so existing keyword callers keep
        working.

    Returns
    -------
    list of str
        The tokens from ``word_tokenize`` in their original order and casing,
        minus every token whose lower-cased form is in NLTK's English
        stopword list. Punctuation tokens are not removed here.
    """
    # A set gives constant-time membership tests; the list returned by
    # ``stopwords.words`` would otherwise be scanned once per token.
    # NOTE(review): the stopword collection is still rebuilt on every call;
    # consider hoisting it before applying this over the whole frame.
    eng_stopwords = set(stopwords.words('english'))

    return [word for word in word_tokenize(str) if word.lower() not in eng_stopwords]

        
# Demo: stopword-filtered tokens for the first five tweets.
df['tweet'].head(5).apply(stopword_remover)

0    [@, switchfoot, http, :, //twitpic.com/2y1zl, ...
1    [upset, ca, n't, update, Facebook, texting, .....
2    [@, Kenichan, dived, many, times, ball, ., Man...
3              [whole, body, feels, itchy, like, fire]
4    [@, nationwideclass, ,, 's, behaving, ., 'm, m...
Name: tweet, dtype: object

In [9]:
#----------------REMOVE NONALPHABETS--------------------
def non_alpha_remover(str):
    """Return the non-stopword tokens of ``str`` that are purely alphabetic.

    The text is first passed through ``stopword_remover``; of the tokens it
    returns, only those for which ``isalpha()`` is true are kept, in their
    original order. Anything containing a digit, punctuation mark or
    apostrophe is therefore dropped.
    """
    alphabetic_tokens = []
    for token in stopword_remover(str):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

# Demo: alphabetic, stopword-free tokens for the first ten tweets.
df['tweet'].head(10).apply(non_alpha_remover)

0    [switchfoot, http, Awww, bummer, shoulda, got,...
1    [upset, ca, update, Facebook, texting, might, ...
2    [Kenichan, dived, many, times, ball, Managed, ...
3              [whole, body, feels, itchy, like, fire]
4            [nationwideclass, behaving, mad, ca, see]
5                              [Kwesidei, whole, crew]
6                                          [Need, hug]
7    [LOLTrish, hey, long, time, see, Yes, Rains, b...
8                                               [nope]
9                               [twittera, que, muera]
Name: tweet, dtype: object

In [10]:
#--------------DETERMINE COMPLEXITY----------------------

def det_complexity(str):
    """Return the lexical diversity (unique tokens / total tokens) of ``str``.

    Parameters
    ----------
    str : str
        Raw text; it is split with ``word_tokenize`` and tokens are compared
        as-is (case-sensitive, punctuation included). The parameter name
        shadows the built-in ``str`` inside this function and is kept for
        backward compatibility.

    Returns
    -------
    float
        A value in [0, 1]; 1.0 means no token is repeated. Text that yields no
        tokens (for example an empty string) returns 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    # Tokenize once and reuse the result for both the total and the unique count.
    all_tokens = word_tokenize(str)

    # Guard the division: there is no ratio to compute without tokens.
    if not all_tokens:
        return 0.0

    return len(set(all_tokens)) / len(all_tokens)

# Demo: unique/total token ratio for the first ten tweets (1.0 = no repeated token).
df['tweet'].head(10).apply(det_complexity)

0    0.962963
1    1.000000
2    1.000000
3    1.000000
4    0.866667
5    1.000000
6    1.000000
7    0.857143
8    1.000000
9    1.000000
Name: tweet, dtype: float64

In [11]:
#------------POLARITY AND SUBJECTIVITY---------------
from textblob import TextBlob

def pol_and_sub(text, print_results = False):
    """Return TextBlob's sentiment scores for ``text``.

    Parameters
    ----------
    text : str
        Text to analyse with ``TextBlob``.
    print_results : bool, default False
        When true, also print both scores rounded to two decimals.

    Returns
    -------
    tuple
        ``(polarity, subjectivity)``, each rounded to one decimal. The tuple
        is returned whether or not ``print_results`` is set, so the function
        has a single return type and can be used with ``Series.apply`` in
        either mode (previously the printing mode returned ``None``).
    """
    # ``sentiment`` is read as a (polarity, subjectivity) pair.
    polarity, subjectivity = TextBlob(text).sentiment

    if print_results:
        print("Polarity is: ", round(polarity, 2), "and Subjectivity is: ", round(subjectivity, 2))

    return (round(polarity, 1), round(subjectivity, 1))
    
# Demo: (polarity, subjectivity) pairs for the first ten tweets, each rounded to one decimal.
df['tweet'].head(10).apply(pol_and_sub)

0     (0.2, 0.6)
1     (0.0, 0.0)
2     (0.5, 0.5)
3     (0.2, 0.4)
4    (-0.6, 1.0)
5     (0.2, 0.4)
6     (0.0, 0.0)
7     (0.3, 0.6)
8     (0.0, 0.0)
9     (0.0, 0.0)
Name: tweet, dtype: object