## Competition organised by [AnalyticsVidhya](http://analyticsvidhya.com/)

### Problem Statement
The problem is based on the sentiment analysis, we have been given the reviews based on products and we have to predict the its sentiment. Sentiment is classified in two parts 0 and 1 (0 negative and 1 positive).

### Datasets
The dataset contains two features 
1. Label (0 and 1)
2. Tweets

We have provided three dataset train, test and sample submission.

### Metrics used
Here, we have to use f1-score to compete.

#### Import dependencies

In [1]:
#import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report

%matplotlib inline

plt.rcParams['figure.figsize']= (15, 12)
plt.style.use('ggplot')
sns.set(color_codes= True)

#### Read the data

In [2]:
df_train= pd.read_csv('train.csv')
df_test= pd.read_csv('test.csv')
df_samp= pd.read_csv('sample.csv')

#### Data preprocessing

In [3]:
import re

def preprocessing_text_length(tweets):
    num_words= []
    num_char= []
    avg_word_len= []
    num_stopwords= []
    num_special_char= []
    num_upper_cases= []
    num_numerics= []

    for tweet in tweets:
        #remove links 
        #tweet= re.sub(r'(http|https|ftp)://[a-zA-Z0-9\./]+', '', tweet, flags= re.I)
        #tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
        
        tweet= tweet.strip().split()
        
        #num of words
        words= [w for w in tweet]
        num_words.append(len(words))
        
        #num_char
        chars= len(tweet)
        num_char.append(chars)
        
        #num_avg word length
        words_avg_len= [sum(len(w) for w in tweet)/len(words)]
        avg_word_len.append(words_avg_len[0])
        
        #number of stop words
        from nltk.corpus import stopwords
        stop= stopwords.words('english')
        stopword= [w for w in tweet if w in stop]
        num_stopwords.append(len(stopword))
        
        #number of special character
        hastags= [w for w in tweet if w.startswith('#')]
        num_special_char.append(len(hastags))
        
        #number of numerics
        numerics= [w for w in tweet if w.isdigit()]
        num_numerics.append(len(numerics))
        
        #number of upper cases
        upper_cases= [w for w in tweet if w.isupper()]
        num_upper_cases.append(len(upper_cases))
        
    return num_char, num_numerics, num_special_char, num_stopwords, num_upper_cases, num_words, avg_word_len

#### Concatenation of data

In [4]:
df_all= pd.concat([df_train, df_test])

num_chars, num_numeric, num_special_chars, num_stopword, num_upper_case, num_word, avg_words_len= preprocessing_text_length(df_all['tweet'])

df_all['num_chars']= num_chars
df_all['num_numeric']= num_numeric
df_all['num_special_chars']= num_special_chars
df_all['num_stopword']= num_stopword
df_all['num_upper_case']= num_upper_case
df_all['num_word']= num_word
df_all['avg_words_len']= avg_words_len

#### Cleaning of the data

In [5]:
def basic_preprocessing(tweets):
    
    #remove punctuations
    tweets= tweets.str.replace('[^\w\s]', '')
    
    #remove stopwords
    from nltk.corpus import stopwords
    stop= stopwords.words('english')
    
    tweets= tweets.apply(lambda x: ' '.join(w for w in x.split() if w not in stop))
    
    #common word removal
    freq= pd.Series(' '.join(tweets).split()).value_counts()[:10]
    
    freq= list(freq)
    tweets= tweets.apply(lambda x: ' '.join(w for w in x.split() if w not in freq))
    
    #rare word removal
    freq= pd.Series(' '.join(tweets).split()).value_counts()[-10:]
    
    freq= list(freq)
    tweets= tweets.apply(lambda x: ' '.join(w for w in x.split() if w not in freq))
    
    #spelling correction
    #from textblob import TextBlob
    #tweets= tweets.apply(lambda x: str(TextBlob(x).correct()))
    
    #tokenize
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer= TreebankWordTokenizer()
    #tweets= tweets.apply(lambda x: (' '.join(w) for w tokenizer.tokenize(x)))
    
    #stemming
    #from nltk.stem import PorterStemmer
    #stemmer= PorterStemmer()
    #tweets= tweets.apply(lambda x: ' '.join[stemmer.stem(w for w in tokenizer.tokenize(x))])
    
    #lemmatization
    from nltk.stem import WordNetLemmatizer
    lemmatizer= WordNetLemmatizer()
    #tweets= tweets.apply(lambda x: lemmatizer.lemmatize(w) for w in x)
    tweets= tweets.apply(lambda x: tokenizer.tokenize(x))
    tweets= tweets.apply(lambda x: [lemmatizer.lemmatize(w) for w in x])
    tweets= tweets.apply(lambda x: ' '.join(w for w in x))
    
    #convert it into lower case
    tweets= tweets.apply(lambda tweet: tweet.strip().lower())
                                             
    return tweets

tweets= df_all['tweet']
tweets= basic_preprocessing(tweets)

df_all['tweets']= tweets

In [6]:
df_all.head()

Unnamed: 0,id,label,tweet,num_chars,num_numeric,num_special_chars,num_stopword,num_upper_case,num_word,avg_words_len,tweets
0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,0,11,0,0,13,8.923077,fingerprint pregnancy test httpsgooglh1mfqv an...
1,2,0.0,Finally a transparant silicon case ^^ Thanks t...,17,0,5,3,1,17,6.764706,finally transparant silicon case thanks uncle ...
2,3,0.0,We love this! Would you go? #talk #makememorie...,15,0,8,1,0,15,7.266667,we love would go talk makememories unplug rela...
3,4,0.0,I'm wired I know I'm George I was made that wa...,17,0,4,2,2,17,5.647059,im wired i know im george i made way iphone cu...
4,5,1.0,What amazing service! Apple won't even talk to...,23,0,0,9,2,23,4.434783,what amazing service apple wont even talk ques...


#### Split the data 

In [9]:
train= df_all[:len(df_train)]
test= df_all[len(df_train):]

### Machine learning modeling

In [10]:
from scipy.sparse import hstack  ##it is used to add the columns in sparse matrix

In [11]:
col_to_add= train.columns.tolist()

In [12]:
col_to_add.remove('id')
col_to_add.remove('label')
col_to_add.remove('tweet')
col_to_add.remove('tweets')

### Applied machine learning algorithms

In [13]:
def ml_modeling(model, train, target, test, countVectorizer= True, tfidfVectorizer= False, col_to_add= col_to_add):
    
    #splitted the dataset in training and validation set (splitted at 25%)
    X_train, X_test, y_train, y_test= train_test_split(train, target, random_state= 5) 
    
    #feature extraction techniques (count vectorizer and tfidf vectorizer)
    if countVectorizer:  
        vect= CountVectorizer().fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction)
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    if tfidfVectorizer:
        vect= TfidfVectorizer(min_df= 5, ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction)
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    return model, model.predict(test_vect)

In [14]:
from sklearn.linear_model import LogisticRegression

In [16]:
clf_log= LogisticRegression(class_weight= 'balanced', random_state= 0)

In [17]:
#using count vectorizer
model_1, pred_1= ml_modeling(clf_log, train.drop(['id', 'tweet', 'label'], axis= 1), train['label'],
                             test.drop(['id', 'tweet', 'label'], axis= 1)) ## called the ml function

X_train_vect (5940, 18026)
X_test_vect (1980, 18026)
test_vect (1953, 18026)
Training accuracy: 0.9683501683501684
Testing accuracy: 0.8984848484848484
f1 score: 0.8157653528872594
Classification Report: 
             precision    recall  f1-score   support

        0.0       0.96      0.90      0.93      1475
        1.0       0.76      0.88      0.82       505

avg / total       0.91      0.90      0.90      1980



In [18]:
#using tfidf vectorizer
model_2, pred_2= ml_modeling(clf_log, train.drop(['id', 'tweet', 'label'], axis= 1), train['label'],
                             test.drop(['id', 'tweet', 'label'], axis= 1), False, True)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


X_train_vect (5940, 3189)
X_test_vect (1980, 3189)
test_vect (1953, 3189)
Training accuracy: 0.9104377104377105
Testing accuracy: 0.8909090909090909
f1 score: 0.8128249566724437
Classification Report: 
             precision    recall  f1-score   support

        0.0       0.97      0.88      0.92      1475
        1.0       0.72      0.93      0.81       505

avg / total       0.91      0.89      0.89      1980



##### This function directly return the tweets after vecorize it, there in no need to write it again and again

In [19]:
def vectorizer(train, target, test, countVectorizer= True, tfidfVectorizer= False, col_to_add= col_to_add):
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, random_state= 5)
    
    if countVectorizer:
        vect= CountVectorizer().fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
    if tfidfVectorizer:
        vect= TfidfVectorizer(min_df= 5, ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
    return X_train_vect, X_test_vect, test_vect, y_train, y_test

In [20]:
##using count vectorizer
X_train_count, X_test_count, test_count, y_train_c, y_test_c= vectorizer(train.drop(['id', 'tweet', 'label'], axis= 1), train['label'],
                                                    test.drop(['id', 'tweet', 'label'], axis= 1))

X_train_vect (5940, 18026)
X_test_vect (1980, 18026)
test_vect (1953, 18026)


In [21]:
##using tfidf vectorizer
X_train_tfidf, X_test_tfidf, test_tfidf, y_train_t, y_test_t= vectorizer(train.drop(['id', 'tweet', 'label'], axis= 1), train['label'],
                                                    test.drop(['id', 'tweet', 'label'], axis= 1), False, tfidfVectorizer= True)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


X_train_vect (5940, 3189)
X_test_vect (1980, 3189)
test_vect (1953, 3189)


In [22]:
X_train_count.shape, X_test_count.shape, test_count.shape, y_train_c.shape, y_test_c.shape

((5940, 18026), (1980, 18026), (1953, 18026), (5940,), (1980,))

In [23]:
X_train_tfidf.shape, X_test_tfidf.shape, test_tfidf.shape, y_train_c.shape, y_test_t.shape

((5940, 3189), (1980, 3189), (1953, 3189), (5940,), (1980,))

#### Deep learning implementation

Now, here, I'm going to implement the ANN model for the sake of getting the better result.

#### Dependencies related to deep learning

In [24]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU, Activation
from keras.losses import binary_crossentropy, categorical_crossentropy, sparse_categorical_crossentropy
from keras.optimizers import SGD, Adam

Using TensorFlow backend.


#### Make data for deep learning

In [23]:
from scipy.sparse.csr import csr_matrix

In [24]:
X_train_count= csr_matrix(X_train_count)
X_test_count= csr_matrix(X_test_count)
test_count= csr_matrix(test_count)
#X_test_count_1= X_test_count

### Model declaration
Here, I used
1. Used 3 hidden layer
2. Used hidden nodes (100, 50, 20)
3. Used ReLu and sigmoid activation function
4. Used stochastic gradient decent optimizer
5. Used binary cross entropy loss function

In [25]:
model= Sequential()
model.add(Dense(units= 100, activation= 'relu', input_dim= X_train_count.shape[1], kernel_initializer= 'uniform'))
model.add(Dense(units= 50, activation= 'relu'))
model.add(Dense(units= 20, activation= 'sigmoid'))
model.add(Dense(units= 1, activation= 'sigmoid'))

sgd= SGD(lr= 0.1)

model.compile(optimizer= sgd, loss= 'binary_crossentropy', metrics= ['accuracy'])
model.fit(x= X_train_count, y= y_train_c, batch_size= 128, epochs= 100, class_weight= 'balanced', shuffle= False,
          validation_data= (X_test_count, y_test_c))

y_pred_c= model.predict_classes(X_test_count, batch_size= 128)

#print accuracies
train_acc= model.evaluate(X_train_count, y_train_c, batch_size= 128)
test_acc= accuracy_score(y_test_c, y_pred_c)
f1= f1_score(y_test_c, y_pred_c)

print('Detail of training losses and accuracies: ', train_acc)
print('Testing accuracy: {}' .format(test_acc))
print('f1 score: {}' .format(f1))

y_pred_real_c= model.predict_classes(test_count)

Train on 5940 samples, validate on 1980 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Detail of training losses and accuracies:  [0.2371006815822839, 0.8968013468414846]
Testing accuracy: 0.8919191919191919
f1 score: 0.8047445255474452


##### With two hidden layers and adam optimizer

In [30]:
model= Sequential()
model.add(Dense(units= 50, activation= 'relu', input_dim= X_train_count.shape[1], kernel_initializer= 'uniform'))
model.add(Dense(units= 20, activation= 'relu'))
#model.add(Dense(units= 20, activation= 'relu'))
model.add(Dense(units= 1, activation= 'sigmoid'))

#sgd= SGD(lr= 0.1)

model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= ['accuracy'])
model.fit(x= X_train_count, y= y_train_c, batch_size= 128, epochs= 5, class_weight= 'balanced', shuffle= False,
          validation_data= (X_test_count, y_test_c))

y_pred_c= model.predict_classes(X_test_count, batch_size= 128)

#print accuracies
train_acc= model.evaluate(X_train_count, y_train_c, batch_size= 128)
test_acc= accuracy_score(y_test_c, y_pred_c)
f1= f1_score(y_test_c, y_pred_c)

print('Detail of training losses and accuracies: ', train_acc)
print('Testing accuracy: {}' .format(test_acc))
print('f1 score: {}' .format(f1))

y_pred_real_c_2= model.predict_classes(test_count)

Train on 5940 samples, validate on 1980 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Detail of training losses and accuracies:  [0.09593365830102754, 0.9705387205387206]
Testing accuracy: 0.9025252525252525
f1 score: 0.8194574368568756


In [34]:
model= Sequential()
model.add(Dense(units= 100, activation= 'relu', input_dim= X_train_count.shape[1], kernel_initializer= 'uniform'))
model.add(Dense(units= 10, activation= 'relu'))
#model.add(Dense(units= 20, activation= 'relu'))
model.add(Dense(units= 1, activation= 'sigmoid'))

#sgd= SGD(lr= 0.1)

model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= ['accuracy'])
model.fit(x= X_train_count, y= y_train_c, batch_size= 128, epochs= 5, class_weight= 'balanced', shuffle= False,
          validation_data= (X_test_count, y_test_c))

y_pred_c= model.predict_classes(X_test_count, batch_size= 128)

#print accuracies
train_acc= model.evaluate(X_train_count, y_train_c, batch_size= 128)
test_acc= accuracy_score(y_test_c, y_pred_c)
f1= f1_score(y_test_c, y_pred_c)

print('Detail of training losses and accuracies: ', train_acc)
print('Testing accuracy: {}' .format(test_acc))
print('f1 score: {}' .format(f1))

y_pred_real_c_2= model.predict_classes(test_count)

Train on 5940 samples, validate on 1980 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Detail of training losses and accuracies:  [0.10107676187398458, 0.97003367003367]
Testing accuracy: 0.9025252525252525
f1 score: 0.8187793427230047


### Using tfidf vectorizer

In [56]:
X_train_tfidf= csr_matrix(X_train_tfidf)
X_test_tfidf= csr_matrix(X_test_tfidf)
test_tfidf= csr_matrix(test_tfidf)

In [57]:
model= Sequential()
model.add(Dense(units= 100, activation= 'relu', input_dim= X_train_tfidf.shape[1], kernel_initializer= 'uniform'))
model.add(Dense(units= 50, activation= 'relu'))
model.add(Dense(units= 20, activation= 'relu'))
model.add(Dense(units= 1, activation= 'sigmoid'))

#sgd= SGD(lr= 0.1)

model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= ['accuracy'])
model.fit(x= X_train_tfidf, y= y_train_t, batch_size= 128, epochs= 15, class_weight= 'balanced', shuffle= True,
          validation_data= (X_test_tfidf, y_test_t))

y_pred_t= model.predict_classes(X_test_tfidf, batch_size= 128)

#print accuracies
train_acc= model.evaluate(X_train_tfidf, y_train_t, batch_size= 128)
test_acc= accuracy_score(y_test_t, y_pred_t)
f1= f1_score(y_test_t, y_pred_t)

print('Detail of training losses and accuracies: ', train_acc)
print('Testing accuracy: {}' .format(test_acc))
print('f1 score: {}' .format(f1))

y_pred_real_1= model.predict_classes(test_tfidf)

Train on 5940 samples, validate on 1980 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Detail of training losses and accuracies:  [0.053460718861934715, 0.9873737373737373]
Testing accuracy: 0.8757575757575757
f1 score: 0.7592954990215263


In [45]:
model= Sequential()
model.add(Dense(units= 100, activation= 'relu', input_dim= X_train_count.shape[1], kernel_initializer= 'uniform'))
model.add(Dense(units= 50, activation= 'relu'))
model.add(Dense(units= 20, activation= 'relu'))
model.add(Dense(units= 1, activation= 'sigmoid'))

#sgd= SGD(lr= 0.1)

model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= ['accuracy'])
model.fit(x= X_train_count, y= y_train_c, batch_size= 128, epochs= 10, shuffle= True)#, validation_data= (X_test_count_1, y_test_c))

y_pred_c= model.predict_classes(X_test_count, batch_size= 128)

#print accuracies
train_acc= model.evaluate(X_train_count, y_train_c, batch_size= 128)
test_acc= accuracy_score(y_test_c, y_pred_c)
f1= f1_score(y_test_c, y_pred_c)

print('Detail of training losses and accuracies: ', train_acc)
print('Testing accuracy: {}' .format(test_acc))
print('f1 score: {}' .format(f1))

y_pred_real= model.predict_classes(test_count)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Detail of training losses and accuracies:  [0.0026310378710679735, 0.9998316498316498]
Testing accuracy: 0.8792929292929293
f1 score: 0.7553735926305015


In [61]:
X_train_count.shape, test_count.shape

((5940, 18026), (1953, 22158))

In [47]:
model= Sequential()
model.add(Dense(units= 200, activation= 'relu', input_dim= X_train_tfidf_.shape[1]))
model.add(Dropout(0.1, seed= 0))
model.add(Dense(units= 20, activation= 'relu'))
model.add(Dropout(0.1, seed= 0))
model.add(Dense(units= 1, activation= 'sigmoid'))

model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= ['accuracy'])

model.fit(X_train_tfidf_, y_train_t_, batch_size= 200, epochs= 5, validation_data= (X_test_tfidf_, y_test_t_))

y_pred= model.predict_classes(X_test_tfidf_, batch_size= 200)

train_acc= model.evaluate(X_train_tfidf_, y_train_t_, batch_size= 200)

test_acc= accuracy_score(y_test_t_, y_pred)

f1= f1_score(y_test_t_, y_pred)

print(train_acc)
print(test_acc)
print(f1)

y_pred_real_tf= model.predict_classes(test_tfidf_, batch_size= 32)

Train on 5940 samples, validate on 1980 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[0.1683602602995606, 0.9368686866679978]
0.8994949494949495
0.8131455399061033


### Used ensmble method

In [27]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier, AdaBoostClassifier

In [79]:
clf_bag_log= BaggingClassifier(clf_log, random_state= 0, max_samples= 0.8)

In [80]:
model_5, pred_5= ml_modeling(clf_bag_log, train, train['label'], test)

X_train_vect (5940, 18820)
X_test_vect (1980, 18820)
test_vect (1953, 18820)
Training accuracy: 0.9488215488215488
Testing accuracy: 0.8964646464646465
f1 score: 0.8138056312443233
Classification Report: 
             precision    recall  f1-score   support

        0.0       0.96      0.90      0.93      1475
        1.0       0.75      0.89      0.81       505

avg / total       0.91      0.90      0.90      1980



In [81]:
clf_ada_log= AdaBoostClassifier(clf_log, learning_rate= 0.1)

In [82]:
model_6, pred_6= ml_modeling(clf_ada_log, train, train['label'], test)

X_train_vect (5940, 18820)
X_test_vect (1980, 18820)
test_vect (1953, 18820)
Training accuracy: 0.8604377104377104
Testing accuracy: 0.8641414141414141
f1 score: 0.7726120033812343
Classification Report: 
             precision    recall  f1-score   support

        0.0       0.96      0.85      0.90      1475
        1.0       0.67      0.90      0.77       505

avg / total       0.89      0.86      0.87      1980



In [83]:
clf_vote= VotingClassifier([('log', clf_log), ('bag_log', clf_bag_log)], voting= 'soft')

In [84]:
model_7, pred_7= ml_modeling(clf_vote, train, train['label'], test)

X_train_vect (5940, 18820)
X_test_vect (1980, 18820)
test_vect (1953, 18820)
Training accuracy: 0.9631313131313132
Testing accuracy: 0.8939393939393939
f1 score: 0.8073394495412844
Classification Report: 
             precision    recall  f1-score   support

        0.0       0.95      0.90      0.93      1475
        1.0       0.75      0.87      0.81       505

avg / total       0.90      0.89      0.90      1980



  if diff:
  if diff:
  if diff:


In [88]:
def ml_modeling(model, train, target, test, countVectorizer= True, tfidfVectorizer= False, col_to_add= col_to_add):
    
    X_train, X_test, y_train, y_test= train_test_split(train, target, random_state= 5)
    
    if countVectorizer:
        vect= CountVectorizer().fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction)
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    if tfidfVectorizer:
        vect= TfidfVectorizer(min_df= 5, ngram_range= (1, 3)).fit(X_train['tweets'])
        X_train_vect= vect.transform(X_train['tweets'])
        X_test_vect= vect.transform(X_test['tweets'])
        test_vect= vect.transform(test['tweets'])
        
        #now add the columns to the sparse matrix using the scipy library
        for col in col_to_add:
            X_train_vect= hstack((X_train_vect, np.array(X_train[col])[:, None]))
            X_test_vect= hstack((X_test_vect, np.array(X_test[col])[:, None]))
            test_vect= hstack((test_vect, np.array(test[col])[:, None]))
        
        print('X_train_vect', X_train_vect.shape)
        print('X_test_vect', X_test_vect.shape)
        print('test_vect', test_vect.shape)
        
        #modeling and prediction
        model.fit(X_train_vect, y_train)
        prediction= model.predict(X_test_vect)
        
        #print accuracies
        train_acc= model.score(X_train_vect, y_train)
        test_acc= accuracy_score(y_test, prediction)
        f1= f1_score(y_test, prediction)
        
        print('Training accuracy: {}' .format(train_acc))
        print('Testing accuracy: {}' .format(test_acc))
        print('f1 score: {}' .format(f1))
        
        print('Classification Report: ')
        print(classification_report(y_test, prediction))
        
    return model, model.predict(test_vect)

In [89]:
model_10, pred_10= ml_modeling(clf_bag_log, train, train['label'], test)

X_train_vect (5940, 18026)
X_test_vect (1980, 18026)
test_vect (1953, 18026)
Training accuracy: 0.9484848484848485
Testing accuracy: 0.8984848484848484
f1 score: 0.8161024702653248
Classification Report: 
             precision    recall  f1-score   support

        0.0       0.96      0.90      0.93      1475
        1.0       0.76      0.88      0.82       505

avg / total       0.91      0.90      0.90      1980



### Finally, I got the best result using deep learning with f1-score 0.81945