In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

from gensim.utils import simple_preprocess

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

import xgboost as xgb
from xgboost import XGBClassifier

import os

import numpy as np
import pandas as pd
from collections import Counter

# Print the full path of every file found beneath /kaggle/input so the
# dataset paths used further down can be checked by eye.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
# Read the three competition CSV files, in the same order as before:
# training data, test data, and the sample submission.
train_data, test_data, submission_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
        '/kaggle/input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts.
train_data.info()

In [None]:
# Same summary for the test frame, for comparison with the training frame.
test_data.info()

In [None]:
def extract_places(text):
    """Return the cities and countries found in ``text`` as one string.

    NOTE: ``GeoText`` is not imported anywhere in the visible part of this
    notebook; it has to be brought into scope before this function is called.

    Args:
        text: The raw text to scan for place names.

    Returns:
        A ``", "``-separated string listing every detected city followed by
        every detected country. If only one of the two kinds is found, only
        that kind is listed; if nothing is found the result is ``""``.
    """
    places = GeoText(text)
    # One join over both lists covers all four cases (cities and countries,
    # cities only, countries only, neither) without any branching.
    return ", ".join([*places.cities, *places.countries])
    
def pre_process(s):
    """Normalise a Series of raw tweet text before tokenisation.

    Steps, in order: lower-case; drop the stand-alone token ``rt``; replace
    `` via `` with a single space; drop ``@mentions``; drop ``http...`` URLs;
    drop ``www.`` URLs; drop digit runs; drop punctuation characters.

    Args:
        s: pandas Series of strings. Missing values stay missing.

    Returns:
        A new Series with the same index holding the cleaned text. Whitespace
        is not collapsed, so removed tokens can leave repeated spaces behind.
    """
    # Same character set as before; '-' and '[' are escaped explicitly so the
    # class does not rely on the ',-.' range or trigger a nested-set warning.
    punctuation = r'''[¬!"#$%&()*+,\-./:;<=>?@\[\]’^'_`{|}~]'''

    s = s.str.lower()
    # The text is already lower-case, so no case-insensitive flag is needed.
    s = s.str.replace(r'\brt\b', '', regex=True)
    # Replace with a space, not '': deleting " via " outright would glue the
    # words on either side together. regex=False is explicit because the
    # default of `regex` differs between pandas versions.
    s = s.str.replace(' via ', ' ', regex=False)
    s = s.str.replace(r'@\w+', '', regex=True)
    s = s.str.replace(r'http\S+', '', regex=True)
    # The dot is escaped so only real "www." prefixes match, not any word
    # that merely starts with "www".
    s = s.str.replace(r'www\.[^ ]+', '', regex=True)
    s = s.str.replace(r'[0-9]+', '', regex=True)
    s = s.str.replace(punctuation, '', regex=True)
    return s

# NLTK's English stop-word list followed by five extra tokens that this
# notebook also filters out.
stop_words = [
    *stopwords.words('english'),
    'dont', 'like', 'im', 'would', 'amp',
]

def remove_stopwords(texts):
    """Tokenise each document and drop the tokens listed in ``stop_words``.

    Every entry of ``texts`` is converted with ``str()`` and tokenised with
    gensim's ``simple_preprocess``; tokens present in the module-level
    ``stop_words`` list are then removed.

    Args:
        texts: Iterable of documents (e.g. a pandas Series of strings).

    Returns:
        A list with one list of kept tokens per input document, in input order.
    """
    # `stop_words` is a list, so build a set once per call instead of
    # rescanning the whole list for every token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def lemmatizing(words):
    """Lemmatize every token in ``words`` with NLTK's ``WordNetLemmatizer``.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of lemmatized tokens, in the same order as ``words``.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in words:
        lemmas.append(to_lemma(token))
    return lemmas

def final_text(words):
    """Join an iterable of string tokens into one space-separated string."""
    separator = ' '
    joined = separator.join(words)
    return joined
    

In [None]:
# Work on an independent copy of the training frame without the label
# column, so the cleaning steps below never touch train_data itself.
copy_train = train_data.drop('target', axis='columns').copy()
copy_train.head()

In [None]:
#train_data['places'] = train_data.text.apply(lambda x: extract_places(x))
#test_data['places'] = test_data.text.apply(lambda x: extract_places(x))

# Apply the same four steps to the training copy and to the test frame:
#   1. clean_text <- normalised raw text
#   2. clean_text <- per-row list of tokens with stop words removed
#   3. token      <- lemmatized version of those tokens
#   4. clean_text <- the step-2 tokens joined back into one string
# NOTE(review): step 4 joins the tokens from step 2, not the lemmatized
# `token` column, so lemmatization does not reach `clean_text` — confirm
# whether that is intended.
for frame in (copy_train, test_data):
    frame['clean_text'] = pre_process(frame.text)
    frame['clean_text'] = remove_stopwords(frame.clean_text)
    frame['token'] = frame['clean_text'].apply(lemmatizing)
    frame['clean_text'] = frame['clean_text'].apply(final_text)

In [None]:
# Flatten the per-row token lists of the training copy into one list, then
# tabulate the 20 most frequent tokens.
a = []
for row_tokens in copy_train['token']:
    a.extend(row_tokens)
train_token_counts = Counter(a)
top_20_train = pd.DataFrame(
    train_token_counts.most_common(20),
    columns=['word', 'frequency'],
)
print(top_20_train)


In [None]:
# Flatten the per-row token lists of the test frame into one list, then
# tabulate the 20 most frequent tokens.
b = []
for row_tokens in test_data['token']:
    b.extend(row_tokens)
test_token_counts = Counter(b)
top_20_test = pd.DataFrame(
    test_token_counts.most_common(20),
    columns=['word', 'frequency'],
)
print(top_20_test)

In [None]:
# Drop, in place, the columns that are not used from here on; `id` and
# `clean_text` are not in the list and therefore stay in both frames.
unused_columns = ['keyword', 'location', 'text', 'token']
for frame in (copy_train, test_data):
    frame.drop(columns=unused_columns, inplace=True)


In [None]:
# Fixed seed so the hold-out split, and every score computed from it, is the
# same on each Restart & Run All.
RANDOM_STATE = 42

# Bag-of-words counts: at most 2500 terms, each appearing in at least 5
# documents and in at most half of all documents.
vectorizer = CountVectorizer(max_features=2500, min_df=5, max_df=0.5)
X = vectorizer.fit_transform(copy_train.clean_text).toarray()
y = train_data['target']

# Hold out 20% of the rows for the final evaluation of the tuned model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

# Search space for the grid search: 4 * 5 * 3 * 4 = 240 combinations.
param_grid = {
    'n_estimators': range(6, 10),
    'max_depth': range(3, 8),
    'learning_rate': [.2, .3, .4],
    'colsample_bytree': [.7, .8, .9, 1],
}

# NOTE: this rebinds the name `xgb`, which the import cell bound to the
# xgboost module; the module is not reachable as `xgb` after this line.
# NOTE(review): 'mlogloss' is the multi-class log-loss metric — confirm it is
# the intended eval_metric for this target.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
count_data = pd.DataFrame(X_train, columns=feature_names)
# y_train still carries the shuffled row labels of the original frame, while
# count_data has a fresh 0..n-1 index. Assign by position (plain array) so
# each row receives its own label instead of being re-aligned on the index.
count_data['target'] = np.asarray(y_train)

# 3-fold grid search over param_grid. refit=True retrains the best
# combination on all of X_train; return_train_score=True additionally records
# training-fold scores in cv_results_.
g_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    n_jobs=1,
    verbose=0,
    refit=True,
    return_train_score=True,
)
g_search.fit(X_train, y_train)

In [None]:
# Mean training-fold score of the first ten parameter combinations (recorded
# because the search was created with return_train_score=True).
g_search.cv_results_['mean_train_score'][0:10]

In [None]:
# Mean cross-validation (held-out fold) score of the same first ten parameter
# combinations, for comparison with the training-fold scores.
g_search.cv_results_['mean_test_score'][0:10]

In [None]:
# Parameter combination with the best mean cross-validation score.
print(g_search.best_params_)

In [None]:
# Score the refitted best model on the 20% hold-out split, which the grid
# search never saw.
clf_test = g_search.best_estimator_
predictions = clf_test.predict(X_test)
holdout_report = classification_report(y_test, predictions)
print(holdout_report)

In [None]:
# Encode the test text with the vectorizer already fitted on the training
# text. With a fixed vocabulary a second vectorizer yields the same columns,
# so nothing needs refitting, and this no longer depends on
# get_feature_names(), which scikit-learn removed in 1.2.
# `vec_test` is kept as a name so any later reference to it still resolves.
vec_test = vectorizer
test_data_vec = vec_test.transform(test_data.clean_text).toarray()

In [None]:
# Predict a label for every test row and pair it with that row's id.
new_predictions = clf_test.predict(test_data_vec)
submission = test_data[['id']].assign(target=new_predictions)

In [None]:
# Compare the predicted labels with the `target` column of submission_sample.
# NOTE(review): submission_sample is read from sample_submission.csv, which is
# presumably a format template rather than ground-truth labels — if so, this
# report is not a real score for the test set; confirm before relying on it.
print(classification_report(submission_sample['target'], submission['target'])) 

In [None]:
# Write the predictions to submission.csv in the working directory, without
# the DataFrame index, so the file holds only the `id` and `target` columns.
submission.to_csv("submission.csv", index=False)