# Mount Drive

In [0]:
# Mount Google Drive inside the Colab VM at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
cd 'drive/My Drive/Colab Notebooks'

# Colab Setup for GPU

In [0]:
# Fetch LightGBM together with its submodules (--recursive) into the current
# directory, then enter the checkout. The clone is not pinned to a tag or commit.
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd LightGBM

In [0]:
!mkdir build

In [0]:
!cmake -DUSE_GPU=1 #avoid ..
!make -j$(nproc)

In [0]:
# Enter the Python package directory of the LightGBM checkout.
# NOTE(review): this absolute path assumes the repository was cloned while the
# working directory was /content. If the Drive `cd` cell ran first, the checkout
# lives under the Drive folder instead — confirm the intended execution order.
%cd /content/LightGBM/python-package

In [0]:
!sudo python setup.py install — precompile

In [0]:
!sudo -H pip install vaderSentiment emoji -U

# Sentiment Analysis

In [0]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import emoji
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
%matplotlib inline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack, csr_matrix, vstack
import pickle as pickle
# Single VADER analyzer instance shared by the emoji sentiment helpers.
analyzer_emoji = SentimentIntensityAnalyzer()

In [0]:
# Read the labelled corpus, drop the 'Unnamed: 0' column (presumably a saved
# index) and encode the labels as positive -> 0, negative -> 1.
data = pd.read_csv('sentiment_data.csv').drop(columns='Unnamed: 0')
data['sentiment'] = data['sentiment'].map({'positive': 0, 'negative': 1})
data.head()

In [0]:
def extract_emojis(str):
    """Return the emoji characters found in a string, in order of appearance.

    The lookup table is resolved at call time so the helper works with
    whichever release of the ``emoji`` package is installed:

    * ``emoji.EMOJI_DATA`` (emoji >= 2.0, where ``UNICODE_EMOJI`` was removed);
    * ``emoji.UNICODE_EMOJI['en']`` (emoji 1.x, which nests the table by language);
    * ``emoji.UNICODE_EMOJI`` itself (older releases, a flat emoji -> name map).

    Args:
        str: Text to scan. The name shadows the builtin but is kept so that
            existing keyword callers continue to work.

    Returns:
        list: One entry per matching character. The text is scanned one code
        point at a time, so multi-code-point emoji are not returned as a unit.
    """
    known = getattr(emoji, 'EMOJI_DATA', None)
    if known is None:
        known = emoji.UNICODE_EMOJI
        # emoji 1.x keys the table by language; a flat table has no 'en' entry.
        known = known.get('en', known)
    return [c for c in str if c in known]
def sentiment_emojis(sentence):
    """Average the VADER polarity scores of every emoji in ``sentence``.

    Returns:
        list: ``[neg, neu, pos, compound]`` averaged over the emojis found by
        ``extract_emojis``, or ``[0, 0, 0, 0]`` when the sentence has none.
    """
    score_keys = ('neg', 'neu', 'pos', 'compound')
    found = extract_emojis(sentence)
    if not found:
        return [0, 0, 0, 0]

    totals = [0, 0, 0, 0]
    for symbol in found:
        scores = analyzer_emoji.polarity_scores(symbol)
        # Accumulate component-wise, in the fixed neg/neu/pos/compound order.
        totals = [running + scores[key] for running, key in zip(totals, score_keys)]

    count = len(found)
    return [total / count for total in totals]
def sentiment_emojis_row(row):
    """Attach the averaged emoji sentiment of ``row['text']`` to ``row``.

    Writes ``emoji_neg``, ``emoji_neu``, ``emoji_pos`` and ``emoji_compound``
    into the given row (mutating it) and returns that same object, which makes
    the function usable with ``DataFrame.apply(..., axis=1)``.
    """
    scores = sentiment_emojis(row['text'])
    target_columns = ('emoji_neg', 'emoji_neu', 'emoji_pos', 'emoji_compound')
    for position, column in enumerate(target_columns):
        row[column] = scores[position]
    return row

In [0]:
# Work on a copy so the loaded frame `data` is not modified by the feature engineering.
df = data.copy()

In [0]:
# TODO: try NLTK Porter stemming and punctuation handling as extra preprocessing.

# Replace missing comments BEFORE casting to str: astype(str) would turn NaN into
# the literal text 'nan', leaving fillna nothing to fill.
df['text'] = df['text'].fillna(' ').astype(str)
# Lower-case the comment
df['text'] = df['text'].str.lower()
# Number of whitespace-separated words in the comment
df['num_words'] = df['text'].apply(lambda s: len(s.split()))
# Number of distinct words in the comment
df['num_unique_words'] = df['text'].apply(lambda s: len(set(s.split())))
# Share of distinct words in percent. A comment without any word yields
# 0 / 0 = NaN, which is stored as 0 instead.
df['words_vs_unique'] = (df['num_unique_words'] / df['num_words'] * 100).fillna(0)
# Emoji features are not computed in this cell; sentiment_emojis_row exists for that.
print("Statistical features end!")

Statistical features end!


In [0]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 hold-out split on the sentiment label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    stratify=df['sentiment'],
    test_size=0.25,
    random_state=39,
)
y_train = train_df['sentiment'].to_numpy()

In [0]:
# Keep only the engineered statistic columns: everything except the raw text,
# the label and the tweet metadata listed below.
EXCLUED_COLS = [
    'text', 'sentiment', 'tweet_id', 'user_id', 'user_screen_name', 'user_name',
    'created_at', 'static_link', 'Tag', 'retweets', 'favorites',
]
static_cols = [column for column in train_df.columns if column not in EXCLUED_COLS]
print(static_cols)
X_train_static, X_test_static = train_df[static_cols], test_df[static_cols]
print(X_train_static.shape, X_test_static.shape)

['num_words', 'num_unique_words', 'words_vs_unique']
(3591483, 3) (1197162, 3)


In [0]:
# TF-IDF over the comment text: ignore terms found in fewer than 5 documents or
# in more than 80% of them, keep at most 10,000 terms, and scale term frequency
# sub-linearly (1 + log(tf)).
tfidf = TfidfVectorizer(min_df=5, max_df=0.8, max_features=10000, sublinear_tf=True)

In [0]:
# Fit the vocabulary and IDF weights on the training comments only, then apply
# the fitted transform to the held-out comments.
train_comments, test_comments = (frame['text'].values for frame in (train_df, test_df))
X_train_tfidf = tfidf.fit_transform(train_comments)
X_test_tfidf = tfidf.transform(test_comments)

In [0]:
# Persist the fitted vectorizer to disk with pickle.
with open('vectorizer.pk', mode='wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [0]:
# Disabled variant: append the hand-crafted statistic columns to the TF-IDF matrix.
#X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
#X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()

# Active variant: the feature matrices consist of the TF-IDF block only.
X_train = hstack([X_train_tfidf]).tocsr()
X_test = hstack([X_test_tfidf]).tocsr()

In [0]:
# Hold out 10% of the training rows for validation. A fixed seed (the same value
# as the train/test split) makes the validation set and reported metrics reproducible.
X_train_split, X_valid, y_train_split, y_valid = train_test_split(
    X_train_tfidf, y_train, test_size=0.1, random_state=39
)

In [0]:
# Booster settings. 'num_trees' is a LightGBM alias for the number of boosting
# iterations; when present in the params it is used in preference to any
# num_boost_round argument given to lgb.train.
# NOTE(review): no 'device' entry is set, so the GPU build is presumably not
# exercised — confirm whether 'device': 'gpu' was intended.
param = {
    'num_leaves': 100,
    'num_trees': 300,
    'objective': 'binary',
    'max_bin': 255,
    'learning_rate': 0.1,
}

In [0]:
# Wrap the sparse feature matrices and their labels in LightGBM datasets.
train_data = lgb.Dataset(data=X_train_split, label=y_train_split)
valid_data = lgb.Dataset(data=X_valid, label=y_valid)

In [0]:
# The number of boosting rounds comes from param['num_trees']. A separate
# num_boost_round argument would be ignored in favour of that entry (LightGBM
# only emits a warning), so none is passed here.
bst = lgb.train(param, train_data, valid_sets=[valid_data])



[1]	valid_0's binary_logloss: 0.604456
[2]	valid_0's binary_logloss: 0.570083
[3]	valid_0's binary_logloss: 0.542205
[4]	valid_0's binary_logloss: 0.519108
[5]	valid_0's binary_logloss: 0.499579
[6]	valid_0's binary_logloss: 0.48308
[7]	valid_0's binary_logloss: 0.468679
[8]	valid_0's binary_logloss: 0.456291
[9]	valid_0's binary_logloss: 0.445421
[10]	valid_0's binary_logloss: 0.435887
[11]	valid_0's binary_logloss: 0.42745
[12]	valid_0's binary_logloss: 0.419877
[13]	valid_0's binary_logloss: 0.412988
[14]	valid_0's binary_logloss: 0.406981
[15]	valid_0's binary_logloss: 0.401456
[16]	valid_0's binary_logloss: 0.396606
[17]	valid_0's binary_logloss: 0.392069
[18]	valid_0's binary_logloss: 0.388168
[19]	valid_0's binary_logloss: 0.38454
[20]	valid_0's binary_logloss: 0.381338
[21]	valid_0's binary_logloss: 0.378339
[22]	valid_0's binary_logloss: 0.375641
[23]	valid_0's binary_logloss: 0.373194
[24]	valid_0's binary_logloss: 0.370939
[25]	valid_0's binary_logloss: 0.368855
[26]	valid_0

In [0]:
# Threshold the predicted probabilities at 0.5 and report validation accuracy.
print("accuracy: {}".format(
    accuracy_score(y_valid, (bst.predict(X_valid) > 0.5).astype(int))
))

accuracy: 0.8464787595120689


In [0]:
# Vectorize one hand-written comment with the fitted TF-IDF model and score it.
new_pred = tfidf.transform(["çok kötüyüm"])
bst.predict(new_pred)

array([0.54027656])

In [0]:
filename = 'lgbm_model.sav'
# Context managers guarantee the file is flushed and closed before it is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(bst, model_file)

# some time later...

# load the model from disk
# NOTE: pickle.load can execute code embedded in the file; only load files you created.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(new_pred)
print(result)

[0.54027656]
