# Imports

In [53]:
import numpy as np
import pandas as pd
import nltk
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer,SnowballStemmer
from sklearn.svm import SVC  

# Reading data

### The data can be dowloaded on http://help.sentiment140.com/for-students/

In [54]:
df = pd.read_csv('dataset/trainning-tweets.csv',encoding = "ISO-8859-1",names=['sentiment','id','date','Query','user','tweet']) 

In [55]:
X = df.tweet.values.tolist()
y = df.sentiment.values.tolist()

### Raw data example

In [56]:
print(X[0])
print(y[0])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
0


# Cleanning

### Remove links

In [57]:
X_nolinks = [re.sub(r'http\S+', '', sentence) for sentence in X]
X_nolinks = [re.sub(r'https\S+', '', sentence) for sentence in X_nolinks]
X_nolinks = [re.sub(r'www\S+', '', sentence) for sentence in X_nolinks]

### Remove sitações do twitter como @user

In [58]:
X_cleanned = [re.sub(r'@\S+', '', sentence) for sentence in X_nolinks]

### Remove hashtags do twitter como #perplex

In [59]:
X_cleanned = [re.sub(r'#\S+', '', sentence) for sentence in X_cleanned]

### Remove caracteres especiais e pontuações 

In [60]:
X_cleanned = [re.sub(r"[^a-zA-Z0-9]+", ' ', sentence.lower()) for sentence in X_cleanned]

### Remove words with 2 or less letters

In [61]:
X_cleanned = [re.sub(r'\W*\b\w{1,2}\b', '', sentence) for sentence in X_cleanned]

### Diferença do texto antes e depois do cleanning

In [62]:
print(X[0])
print(X_cleanned[0])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
 that bummer you shoulda got david carr third day


# Stemming with SnowballStemmer

In [63]:
snowball=SnowballStemmer(language="english")

In [64]:
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(snowball.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

### O próximo bloco demora uns 8 minutos

In [65]:
X_steammed = []
for sentence in tqdm(X_cleanned):
    new_sentence = stemSentence(sentence)
    X_steammed.append(new_sentence)

100%|██████████| 1600000/1600000 [05:57<00:00, 4475.65it/s]


### Diferença do texto antes e depois do stemming

In [66]:
print(X_cleanned[0])
print(X_steammed[0])

 that bummer you shoulda got david carr third day
that bummer you shoulda got david carr third day 


# Caracterização dos textos com TFIDF e remoção de stopwords

In [67]:
vect = TfidfVectorizer(stop_words='english')
X_tfidf = vect.fit_transform(X_steammed)

### How the data is organized in tfidf

In [68]:
print(X_steammed[0])
print(X_tfidf[0])

that bummer you shoulda got david carr third day 
  (0, 35471)	0.4205305051274454
  (0, 176449)	0.49846217907581125
  (0, 83008)	0.2198740823945423
  (0, 53000)	0.3971277257260902
  (0, 38731)	0.5732665182470988
  (0, 53110)	0.20000001673969783


# Trainning and predicting

### TODO