In [1]:
# Import the basic libraries
import pandas as pd
pd.set_option('display.max_colwidth',None)

In [2]:
# Load data
df = pd.read_csv('/home/hari/Documents/MLAI/Datasets/tweets.csv')

In [3]:
# Display a part of the data
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [4]:
# Split the data into train and test.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df[['tweet','label']], test_size=0.2, random_state=42, shuffle=True)

In [5]:
train.reset_index(inplace=True)
test.reset_index(inplace=True)

#### Data Preprocessing

In [6]:
import re
def remove_hashtags(text):
    cleaned_text = re.sub(r'#\w+', '', text)
    return cleaned_text

def remove_urls(text):
    url_pattern = re.compile(r'http?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
    
import string
def remove_punc(text):
    punc_free = ''.join([i for i in text if i not in string.punctuation])
    return punc_free

import nltk
def tokenization(text):
    words = nltk.word_tokenize(text)
    return words

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
def lemm(text):
    lemm_text = [wordnet_lem.lemmatize(word) for word in text]
    return lemm_text

In [7]:
def preprocess(df_col):
    corpus = []
    for item in df_col:
        new_item = remove_hashtags(item)
        new_item = remove_urls(new_item)
        new_item = remove_punc(item)
        new_item = new_item.lower()
        new_item = tokenization(new_item)
        new_item = remove_stopwords(new_item)
        new_item = lemm(new_item)
        corpus.append(' '.join(str(x) for x in new_item))
    return corpus

In [8]:
corpus = preprocess(train['tweet'])

In [9]:
# Using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2))
traindata = tfidf.fit_transform(corpus)

In [10]:
# Splitting the data into X and y
X = traindata
y = train['label']

In [11]:
# Using SMOTE to handle the imbalanced data
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=33)
X, y= smote.fit_resample(X, y)

In [12]:
# Using nested Cross Valiadation for getting the best parameters
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

random_forest = RandomForestClassifier()

# Define parameter grid for hyperparameter tuning
param_grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the outer cross-validation strategy
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform nested cross-validation for Random Forest
grid_search_rf = GridSearchCV(estimator=random_forest, param_grid=param_grid_rf, scoring='accuracy', cv=3,n_jobs=4)
nested_score_rf = cross_val_score(grid_search_rf, X, y, cv=outer_cv, scoring='accuracy')

print(f'Random Forest Nested Cross-Validation Accuracy: {nested_score_rf.mean()}')

Random Forest Nested Cross-Validation Accuracy: 0.9341000891925738


In [13]:
grid_search_rf.fit(X,y)

In [14]:
best_params_rf = grid_search_rf.best_params_
print("Best Parameters for Random Forest:", best_params_rf)

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [15]:
# Preprocessing the test data.
test_corpus = preprocess(test['tweet'])

In [16]:
test_data = tfidf.transform(test_corpus)

In [17]:
test_label = test['label']

In [18]:
predictions = grid_search_rf.predict(test_data)

In [19]:
# Prediction with test data.
rf_accuracy = accuracy_score(test_label,predictions)
print("The accuracy is ",rf_accuracy*100)

The accuracy is  88.69949494949495


In [20]:
# Using Naive Bayes
from sklearn.naive_bayes import GaussianNB
GNBclf = GaussianNB()
GNBclf.fit(X.toarray(),y)

In [21]:
predictions_NB = GNBclf.predict(test_data.toarray())

In [22]:
nb_accuracy = accuracy_score(test_label,predictions_NB)
print("The accuracy is ",nb_accuracy*100)

The accuracy is  83.52272727272727


##### Using gensim

In [23]:
# Create word embeddings
import gensim
w2v_model = gensim.models.Word2Vec(corpus,min_count=1)
y = train['label']

In [24]:
# Applying those embeddings
import numpy as np
words = w2v_model.wv.index_to_key
X_train_vec = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])for ls in corpus],dtype=object)
X_test_vec = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])for ls in test_corpus],dtype=object)

In [25]:
X_train_vec_avg =[]
for v in X_train_vec:
    if v.size:
        X_train_vec_avg.append(v.mean(axis=0))
    else:
        X_train_vec_avg.append(np.zeros(100,dtype=float))

X_test_vec_avg =[]
for v in X_test_vec:
    if v.size:
        X_test_vec_avg.append(v.mean(axis=0))
    else:
        X_test_vec_avg.append(np.zeros(100,dtype=float))

In [26]:
# Using smote to handle the imbalance
smote = SMOTE(random_state=33)
X_train_vec_avg, y= smote.fit_resample(X_train_vec_avg, y)

In [27]:
clf = RandomForestClassifier(max_depth = None, min_samples_leaf =1, min_samples_split = 5, n_estimators =100)
clf.fit(X_train_vec_avg,y)

In [28]:
y_preds = clf.predict(X_test_vec_avg)

In [29]:
# Prediction with the test data
pred_gensim_rf = accuracy_score(test_label,y_preds)
print("The accuracy is ",pred_gensim_rf*100)

The accuracy is  84.28030303030303


In [30]:
clf_nb = GaussianNB()
clf_nb.fit(X_train_vec_avg,y)

In [31]:
y_preds_nb = clf_nb.predict(X_test_vec_avg)

In [32]:
# Prediction with the test data
pred_gensim_nb = accuracy_score(test_label,y_preds_nb)
print("The accuracy is ",pred_gensim_nb*100)

The accuracy is  75.88383838383838
