### Import libraries

In [None]:
import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


In [None]:
from scipy.stats import randint


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV , cross_val_score
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.svm import SVC

In [None]:
import re
import os
from tqdm import tqdm
import string
from collections import defaultdict
from collections import  Counter


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant


from keras.optimizers import Adam

In [None]:
# Use the ggplot look for every matplotlib figure drawn below.
plt.style.use('ggplot')
# English stop words as a set. This needs the NLTK 'stopwords' corpus to be
# available locally (see the commented-out nltk.download calls above).
stop_words =set(stopwords.words('english'))

In [None]:
# Seed both the stdlib and the NumPy global RNGs with the same fixed value.
import random
random.seed(31415)
np.random.seed(31415)
# NOTE(review): this silences every warning category for the rest of the
# session, including any convergence or deprecation warnings raised by the
# models fitted below.
import warnings
warnings.filterwarnings('ignore')

### Basic EDA 

In [None]:
# Load the train and test CSVs from the local data/ directory and preview the
# first three training rows.
tweets_train = pd.read_csv('data/train.csv')
tweets_test = pd.read_csv('data/test.csv')
tweets_train.head(3)

In [None]:
# Report the shape of each split. The second message used to say "train" even
# though it prints the shape of the test frame.
print('There are {} rows and {} columns in train'.format(tweets_train.shape[0],tweets_train.shape[1]))
print('There are {} rows and {} columns in test'.format(tweets_test.shape[0],tweets_test.shape[1]))

In [None]:
# Class balance of the training labels: one bar per target value.
x=tweets_train.target.value_counts()
# Pass the data by keyword: recent seaborn releases no longer accept x/y as
# positional arguments, while the keyword form works on old and new versions.
sns.barplot(x=x.index, y=x.values)
plt.gca().set_ylabel('samples')

### Text Preprocessing

In [None]:
# Stack the train and test rows into one frame (train rows first) so the text
# cleaning below is applied to both at once; then show the combined shape.
df=pd.concat([tweets_train,tweets_test] , sort = False)
df.shape

In [None]:
df

#### preprocess text

The cleaning steps below are based on the text analysis.

In [None]:
def preprocess_text(text):
    """Strip digits, URLs, HTML tags, emojis and punctuation from ``text``.

    The steps run in exactly that order. Each one deletes the matched
    characters without inserting a replacement, so the whitespace that
    surrounded a removed piece is left as it was. Returns the cleaned string.
    """
    # Digits first, then links, then anything that looks like a tag.
    without_digits = re.sub(r'[0-9]+', '', text)
    without_urls = re.sub(r'https?://\S+|www\.\S+', r'', without_digits)
    without_tags = re.sub(r'<.*?>', r'', without_urls)

    # One character class covering the emoji / pictograph code-point ranges.
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    without_emojis = emoji_pattern.sub(r'', without_tags)

    # Drop every ASCII punctuation character in a single translate pass.
    return without_emojis.translate(str.maketrans('', '', string.punctuation))



In [None]:
# Run the cleaning function over every tweet (train and test rows alike),
# overwrite the column with the result, and preview the first ten rows.
df['text'] = df['text'].apply(preprocess_text)
df.head(10)

#### Tokenization and normalization

In [None]:
from nltk.tokenize import WhitespaceTokenizer

In [None]:
def nltk_preprocess_text(text):
    
    text = WhitespaceTokenizer().Tokenize(text)
    
    
    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = ' '.join((lmtzr.lemmatize(i)) for i in text)
    
    return text

In [None]:
df.text = df.text.apply(preprocess_text)
df.head(10)

In [None]:
# Split the combined frame back into its two parts by position. The boundary
# comes from the training frame rather than a hard-coded row count, so it stays
# correct if train.csv changes; df was built with the train rows first.
n_train = len(tweets_train)
train_data = df.text.iloc[:n_train]
test_data = df.text.iloc[n_train:]

In [None]:
train_data

In [None]:
# TfidfVectorizer is not imported by any earlier cell (only CountVectorizer
# is), so import it next to its use.
from sklearn.feature_extraction.text import TfidfVectorizer

# count_vectorizer = CountVectorizer(analyzer = 'word', ngram_range=(1, 2) ,stop_words = stop_words, max_df = 0.9)
# Bigram-only TF-IDF features with no document-frequency pruning. The stop
# words are passed as a list because scikit-learn validates this parameter as
# 'english', a list, or None; a set is rejected by recent releases.
tfidf_vectorizer = TfidfVectorizer(analyzer = 'word' , ngram_range = (2,2), stop_words=sorted(stop_words),
                                  max_df =1.0 , min_df = 0.00 )

In [None]:
# Learn the vocabulary and weights from the training tweets only ...
train_vectors = tfidf_vectorizer.fit_transform(train_data)


# ... and reuse that fitted vectorizer, unchanged, for the test tweets.
test_vectors = tfidf_vectorizer.transform(test_data)

In [None]:
# Dense array copy of the training feature matrix.
# NOTE(review): no later cell shown here reads train_vectors_dense, and a dense
# bigram matrix can be very large — confirm it is still needed.
train_vectors_dense = train_vectors.toarray()

In [None]:
train_vectors

In [None]:
test_vectors

In [None]:
# Dense array copy of the test feature matrix.
# NOTE(review): no later cell shown here reads test_vectors_dense — confirm it
# is still needed before paying for the dense allocation.
test_vectors_dense = test_vectors.toarray()

### Modelling

#### Logistic Regression

In [None]:
# Setup the hyperparameter grid
# Only the iteration cap is searched here.
# NOTE(review): max_iter values of 10 and 5 are very small for lbfgs; any
# convergence warnings are hidden by the global warnings filter — confirm these
# values are intentional.

# c_space = np.logspace(-8, 2, 10)
param_grid = {'max_iter' : [10, 5] }

# Instantiate a logistic regression classifier: logreg
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string 'none' — confirm against the installed version.
logreg = linear_model.LogisticRegression( penalty = 'none', solver ='lbfgs' )

# Instantiate the GridSearchCV object: logreg_cv (5-fold CV, scored by F1)
logreg_cv = GridSearchCV(logreg, param_grid, cv=5 , scoring = 'f1')

# Fit it to the data
logreg_cv.fit(train_vectors, tweets_train['target'])

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))

#### Ridge Classifier

In [None]:
# Candidate regularisation strengths: 20 log-spaced values from 1e-2 to 1e1.
ridge_param_grid = dict(alpha=np.logspace(-2, 1, num=20))

In [None]:
# Ridge classifier with default settings; alpha is searched by the grid search
# in the next cell.
ridge_clf = linear_model.RidgeClassifier()
# ridge_clf  = linear_model.LogisticRegression()

In [None]:

# 5-fold grid search over the ridge regularisation strengths, scored by F1 and
# run with n_jobs=-1.
ridgeclf_cv = GridSearchCV(estimator = ridge_clf, param_grid = ridge_param_grid , cv=5 , scoring = 'f1' , n_jobs = -1)

ridgeclf_cv.fit(train_vectors, tweets_train['target'])

# The label used to say "Logistic Regression", which mislabelled these
# ridge-classifier results in the notebook output.
print("Tuned Ridge Classifier Parameters: {}".format(ridgeclf_cv.best_params_))
print("Best score is {}".format(ridgeclf_cv.best_score_))

In [None]:
# Per-fold F1 (5 folds) of ridge_clf as instantiated above with its default
# settings, not of the tuned estimator found by ridgeclf_cv.
scores = cross_val_score(ridge_clf, train_vectors , tweets_train['target'] , cv=5 , scoring = 'f1')
scores

In [None]:
# NOTE(review): this fits the default ridge_clf on the full training set;
# ridgeclf_cv.best_estimator_ holds the tuned model if that is the one meant to
# produce predictions — confirm.
ridge_clf.fit(train_vectors, tweets_train["target"])

In [None]:
# Predict labels for the test tweets with the ridge classifier fitted in the
# previous cell, then display them.
ridge_clf_test_preds = ridge_clf.predict(test_vectors)
ridge_clf_test_preds

#### ElasticNet Classifier

In [None]:
# Setup the hyperparameter grid

c_space = np.logspace(-8, 2, 10)
# NOTE(review): l1_ratio is computed here but the grid below uses a
# hand-written list instead — confirm which one is intended.
l1_ratio = np.logspace(-1, 0, 6)
# NOTE(review): the last l1_ratio entry is `0.`; presumably another value such
# as 0.9 was meant — verify.
elasticnet_param_grid = {'C' : c_space,'max_iter' : [10, 5] , 'l1_ratio' : [0.1, 0.3, 0.5, 0.7, 0.] }

# Instantiate a logistic regression classifier: elasticnet_clf
# (elastic-net penalty, balanced class weights, saga solver)
elasticnet_clf = linear_model.LogisticRegression( penalty = 'elasticnet', class_weight = 'balanced', solver = 'saga')

# Instantiate the GridSearchCV object: elasticnet_clf_cv (5-fold CV, scored by F1)
elasticnet_clf_cv = GridSearchCV(elasticnet_clf, elasticnet_param_grid, cv=5 , scoring = 'f1')

# Fit it to the data
elasticnet_clf_cv.fit(train_vectors, tweets_train['target'])

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(elasticnet_clf_cv.best_params_))
print("Best score is {}".format(elasticnet_clf_cv.best_score_))

#### Naive Bayes , LDA 

#### Decision Tree Classifier

In [None]:


# Search space for the randomized search: plain lists are sampled from
# directly, the randint entries are integer distributions to draw from.
dec_tree_param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 2000),
    "min_samples_leaf": randint(1, 20),
    "criterion": ["gini", "entropy"],
}

# Untuned tree; the search supplies the hyperparameters to try.
dec_tree = DecisionTreeClassifier()

# Randomized search with 5-fold CV, scored by F1.
dec_tree_cv = RandomizedSearchCV(
    dec_tree,
    dec_tree_param_dist,
    scoring='f1',
    cv=5,
)
dec_tree_cv.fit(train_vectors, tweets_train['target'])

print("Tuned Decision Tree Parameters: {}".format(dec_tree_cv.best_params_))
print("Best score is {}".format(dec_tree_cv.best_score_))

### Ensemble methods

In [None]:

### Bagging Classifier , Random Forests
### Boosting - Adaboost, Gradient Boosting

### Voting classifier

#### Bagging - Random Forests classifier

In [None]:
# Define the dictionary 'params_rf_clf'
# 'auto' was dropped from max_features: for a classifier it selected the same
# setting as 'sqrt', so the grid fitted every combination twice, and newer
# scikit-learn releases no longer accept it.
params_rf_clf = {
    'n_estimators': [6, 8, 10, 12],
    'criterion': ['entropy'],
    'max_features': ['sqrt'],
    'class_weight': ['balanced'],
    'min_samples_leaf': [2],
}

In [None]:
# Base forest with default settings; the grid supplies the values to try.
rf_clf = RandomForestClassifier()

# 5-fold, F1-scored grid search over params_rf_clf, run with n_jobs=-1.
grid_rf_clf = GridSearchCV(
    rf_clf,
    params_rf_clf,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)

In [None]:
# Fit it to the data
grid_rf_clf.fit(train_vectors, tweets_train['target'])

In [None]:
print("Tuned Random Forests Parameters: {}".format(grid_rf_clf.best_params_))
print("Best score is {}".format(grid_rf_clf.best_score_))

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; one of the two
# can probably be removed.
print("Tuned Random Forests Parameters: {}".format(grid_rf_clf.best_params_))
print("Best score is {}".format(grid_rf_clf.best_score_))

In [None]:
best_rf_clf = grid_rf_clf.best_estimator_
best_rf_clf

In [None]:
# Fit the selected forest on the full training set.
# NOTE(review): grid_rf_clf presumably already refit best_estimator_ on the
# full data (GridSearchCV's default), which would make this call redundant —
# confirm.
best_rf_clf.fit(train_vectors, tweets_train['target'])

In [None]:
# Create a pd.Series of features importances
# The forest was trained on tfidf_vectorizer's features; count_vectorizer only
# exists in a commented-out line, so referencing it raised NameError.
# get_feature_names() is missing from newer scikit-learn releases, so prefer
# get_feature_names_out() when the installed version provides it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

importances = pd.Series(data = best_rf_clf.feature_importances_,
                        index = feature_names )

In [None]:

# Order the importances ascending and keep the last 50, i.e. the 50 largest.
importances_sorted = importances.sort_values().tail(50)


In [None]:
# Horizontal bar chart of the selected importances on a 20x20 inch canvas.
fig, ax = plt.subplots(figsize=(20, 20))
importances_sorted.plot(kind='barh', color='lightgreen', ax=ax)
ax.set_title('Features Importances')
plt.show()

#### Boosting - Adaboost Classifier

In [None]:
# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Define the dictionary 'params_adaboost_clf'
# The only base learner tried is `logreg`, the logistic regression instantiated
# in the Logistic Regression section.
# NOTE(review): newer scikit-learn releases name this AdaBoost parameter
# `estimator` instead of `base_estimator` — confirm against the installed version.
params_adaboost_clf = {
    'n_estimators' : [1, 2 ]  , 'base_estimator' : [ logreg ]
}

In [None]:

# Instantiate adaboost_clf
adaboost_clf = AdaBoostClassifier()

# Instantiate grid_adaboost_clf: 3-fold, F1-scored grid search over
# params_adaboost_clf, run with n_jobs=-1.
grid_adaboost_clf = GridSearchCV(estimator = adaboost_clf,
                       param_grid = params_adaboost_clf,
                       scoring= 'f1',
                       cv = 3,
                       n_jobs=-1)

In [None]:
grid_adaboost_clf.fit(train_vectors, tweets_train['target'])

In [None]:
print("Tuned Adaboost Parameters: {}".format(grid_adaboost_clf.best_params_))
print("Best score is {}".format(grid_adaboost_clf.best_score_))

In [None]:
best_adaboost_clf = grid_adaboost_clf.best_estimator_
best_adaboost_clf

#### Support Vector Machines

In [None]:
np.linspace(1 ,10, 10) 

In [None]:
# SVM search space: 12 log-spaced C values from 1 to 10 and both gamma presets.
svm_param_grid = dict(
    C=np.logspace(0, 1, num=12),
    gamma=['auto', 'scale'],
)

In [None]:
svm_clf = SVC(kernel='rbf')

In [None]:
# Instantiate the GridSearchCV object: svmclf_cv (5-fold CV, scored by F1)
svmclf_cv = GridSearchCV(estimator = svm_clf, param_grid = svm_param_grid , cv=5 , scoring = 'f1' , n_jobs = -1)

# Fit it to the data
svmclf_cv.fit(train_vectors, tweets_train['target'])

# Print the tuned parameters and score. The label used to say
# "Logistic Regression", which mislabelled these SVM results.
print("Tuned SVM Parameters: {}".format(svmclf_cv.best_params_))
print("Best score is {}".format(svmclf_cv.best_score_))

#### Combine best models through VotingClassifier

In [None]:
# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

In [None]:
# Ensemble members as (name, estimator) pairs: the plain logistic regression
# plus the best random forest and AdaBoost models picked by their grid searches.
classifiers = [('Logistic Regression', logreg), ('random forest', best_rf_clf) , 
               ( 'Adaboost classifier' , best_adaboost_clf )]

# Instantiate a VotingClassifier voting_clfs
# NOTE(review): voting='soft' presumably requires every member to expose
# predict_proba — confirm for all three estimators.
voting_clfs = VotingClassifier(estimators=classifiers , voting = 'soft' ) 

In [None]:
voting_clfs_scores = cross_val_score(voting_clfs, train_vectors , tweets_train['target'] , cv=5 , scoring = 'f1')
voting_clfs_scores

In [None]:
voting_clfs_scores.mean()

### Submit results

In [None]:
# sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [None]:
# `clf` is not defined in any cell shown here, so the original line raised
# NameError. Use the voting ensemble built above instead: cross_val_score only
# fits clones, so fit it on the full training set before predicting.
voting_clfs.fit(train_vectors, tweets_train['target'])
sample_submission["target"] = voting_clfs.predict(test_vectors)


In [None]:
sample_submission

In [None]:
sample_submission.to_csv("submission.csv", index=False)