In [16]:
# General:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For number computing
import dill as pickle   # For saving trained models

# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#machine learning & text 
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, svm, metrics, preprocessing, grid_search

In [17]:
# model paths
path = '/Users/sifatkhan/desktop/indonesia project/models/'
file_cv_vect_ngram_model = "cv_vect_ngram_model.pk"
file_sentiment_model1 = "sentiment_model1.pk"

In [18]:
#training data
df = pd.read_csv('twitter.csv', engine='python')
df['neg_senti']= np.where(df['sentiment']<3, 1, 0)
print(df.head(), df.shape)

   sentiment                                              tweet  neg_senti
0          5  Two places I'd invest all my money if I could:...          0
1          5  Awesome! Google driverless cars will help the ...          0
2          5  Autonomous vehicles could reduce traffic fatal...          0
3          5  Really good presentation from Jan Becker on Bo...          0
4          5  Ford just revealed it's Automated Ford Fusion ...          0 (2664, 3)


In [19]:
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
from nltk.stem import SnowballStemmer
def clean(input_text):

    stop = set(stopwords.words('english')) 
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    snow = SnowballStemmer('english')

    #Filter ASCII characters
    non_ascii = ''.join([i if ord(i) < 128 else ' ' for i in input_text])

    #remove numbers
    number_free = "".join([i for i in non_ascii if not i.isdigit()])

    # remove punctuation
    punc_free = ''.join(ch if ch not in exclude else ' ' for ch in number_free) 
    
    #remove stop words 
    stop_free = " ".join([i for i in punc_free.lower().split() if i not in stop])

    #Lemmatization
    lemmatized = " ".join(lemma.lemmatize(word) for word in stop_free.split())
    
    #stemmers
    normalized = " ".join(snow.stem(word) for word in lemmatized.split())
    return normalized   

clean('my name is sifat and i am working as senior data scientist at smarts solutions.')

'name sifat work senior data scientist smart solut'

In [20]:
#tweet cleaner 2
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import re
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))   

def tweet_cleaner(text):
        
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    
    refined_words=[]
    for word in words:
        if len(word)>2:
            refined_words.append(word)
            
    return (" ".join(words)).strip()

testing = df.tweet[:10]
test_result = []
for t in testing:
    test_result.append(clean(tweet_cleaner(t)))
test_result

['two place invest money could print self drive car',
 'awesom googl driverless car help blind travel often',
 'autonom vehicl could reduc traffic fatal',
 'realli good present jan becker bosch autom vehicl research autoauto check',
 'ford reveal autom ford fusion hybrid vehicl pretti amaz fordtrend ford test',
 'yeah throw would total beta test autonom car',
 'musk reluct partner appl googl android control autonom smart car would awesom',
 'finish sf la drive rush hour meet cant wait autonom googl car',
 'googl autonom car paid visit nvidia hq pretti cool technolog',
 'final realist timelin full autonom car capabl hat autoforum']

In [21]:
#df['cleaned_tweet'] = np.array([ clean(tweet_cleaner(tweet)) for tweet in df.tweet ])
df['cleaned_tweet'] = np.array([tweet_cleaner(tweet) for tweet in df.tweet ])
titles = df['cleaned_tweet'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 300, 
                             ngram_range=(1, 1), 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

#save trained vectorizer
with open(path + file_cv_vect_ngram_model, 'wb') as file_cv_model:
    pickle.dump(vectorizer, file_cv_model)

In [22]:
def trained_model(model,X,y):
    
    #scoring results
    scoresAccuracy = cross_val_score(model, X, y, cv=30, scoring='accuracy')
    print('CV Accuracy {}, Average Accuracy {}'.format(scoresAccuracy, scoresAccuracy.mean()))
    scoresR = cross_val_score(model, X, y, cv=30, scoring='recall')
    print('CV Recall {}, Average Recall {}'.format(scoresR, scoresR.mean()))
    # F1 = (2 x recall x precision) / (recall + precision)
    scoresf1 = cross_val_score(model, X, y, cv=30, scoring='f1')
    print('CV F1 {}, Average F1 {}'.format(scoresf1, scoresf1.mean()))
    scoresAUC = cross_val_score(model, X, y, cv=30, scoring='roc_auc')
    print('CV AUC {}, Average AUC {}'.format(scoresAUC, scoresAUC.mean()))
    
    #output trained model
    trained_model = model.fit(X, y)
    
    return trained_model

In [23]:
# select model: logistic regression, linear svc
lr = LogisticRegression(class_weight= 'balanced') 
linearsvc = svm.LinearSVC(random_state=0, class_weight= 'balanced')
svc = svm.SVC(C=0.1, gamma=0.1, kernel='rbf', class_weight= 'balanced')

#define model, X and y
model = lr

#unpickle vectorizer
# with open(path + file_cv_vect_ngram_model, 'rb') as file_cv_model:
#     loaded_vectorizer = pickle.load(file_cv_model)
# vectorized_X = loaded_vectorizer.transform(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)
y = df.neg_senti
df.head()

Unnamed: 0,sentiment,tweet,neg_senti,cleaned_tweet
0,5,Two places I'd invest all my money if I could:...,0,two places i d invest all my money if i could ...
1,5,Awesome! Google driverless cars will help the ...,0,awesome google driverless cars will help the b...
2,5,Autonomous vehicles could reduce traffic fatal...,0,autonomous vehicles could reduce traffic fatal...
3,5,Really good presentation from Jan Becker on Bo...,0,really good presentation from jan becker on bo...
4,5,Ford just revealed it's Automated Ford Fusion ...,0,ford just revealed it s automated ford fusion ...


In [24]:
Output_model = trained_model(model,X,y)

# save trained model
with open(path + file_sentiment_model1, 'wb') as file_sentiment_model1:
    pickle.dump(Output_model, file_sentiment_model1)
    
#feature importance
all_feature_names = vectorizer.get_feature_names()
feature_importances = pd.DataFrame({'lrFeatures' : all_feature_names, 'Importance Score': Output_model.coef_[0].tolist()})
feature_importances.sort_values('Importance Score', ascending=False).head(5)

CV Accuracy [0.7752809  0.71910112 0.82022472 0.82022472 0.75280899 0.83146067
 0.80898876 0.62921348 0.52808989 0.60674157 0.46067416 0.52808989
 0.52808989 0.65168539 0.71910112 0.64044944 0.62921348 0.82022472
 0.65168539 0.64044944 0.64044944 0.66292135 0.68539326 0.64044944
 0.69662921 0.70454545 0.70454545 0.69318182 0.65909091 0.6091954 ], Average Accuracy 0.6752733162700228
CV Recall [0.65384615 0.73076923 0.73076923 0.69230769 0.65384615 0.76923077
 0.84615385 0.69230769 0.26923077 0.46153846 0.5        0.65384615
 0.57692308 0.69230769 0.73076923 0.65384615 0.57692308 0.84615385
 0.76923077 0.61538462 0.69230769 0.53846154 0.80769231 0.57692308
 0.65384615 0.72       0.88       0.64       0.68       0.76      ], Average Recall 0.6688205128205128
CV F1 [0.62962963 0.6031746  0.7037037  0.69230769 0.60714286 0.72727273
 0.72131148 0.52173913 0.25       0.40677966 0.35135135 0.44736842
 0.41666667 0.53731343 0.6031746  0.51515152 0.47619048 0.73333333
 0.56338028 0.5        0.52

Unnamed: 0,lrFeatures,Importance Score
125,jobs,2.308946
52,doesn,1.99693
95,government,1.768871
129,law,1.569731
54,dont,1.51688


In [66]:
testt= 'hi there'
test_tweet = vectorizer.transform([testt])
type(float(Output_model.predict(test_tweet)[0]))

float

In [67]:
def get_sentiment(text):
    text= vectorizer.transform([text])
    sentiment = Output_model.predict(text)[0]
    return float(sentiment)
get_sentiment('government cars')    

1.0

In [10]:
def svc_param_selection(X, y, nfolds):

#     Cs = [0.001, 0.01, 0.1, 1, 10, 100, 110]
#     gammas = [0.001, 0.01, 0.1, 1]
    Cs =[0.001, 0.01, 0.1]
    gammas = [0.001, 0.01, 0.1]

    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = model_selection.GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    
    return grid_search.best_params_

In [11]:
params = svc_param_selection(X,y, 11) 

params_C = params.get("C", "none")
params_gamma = params.get("gamma", "none")
print(params_C, params_gamma)
classifier = svm.SVC(C=params_C, gamma=params_gamma, kernel='rbf')
classifier.fit(X,y)

0.001 0.001


SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)