In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    if paths:  # directories without files print nothing
        print("\n".join(paths))

# Any results you write to the current directory are saved as output.

In [None]:
# loading necessary libraries for NLP
from sklearn.model_selection import train_test_split
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

In [None]:
# Load the competition training set (`data`) and test set (`data_test`), in that order.
data, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)


In [None]:
# Data cleaning and preparation.
# First attempt: model the tweet text only, so drop the `keyword` and `location` fields.
# data1 is built as a new frame rather than an alias of `data`: with `data1 = data`
# the in-place drop also stripped the columns from `data`, and re-running the cell
# raised KeyError because the columns were already gone.
data1 = data.drop(columns=['keyword', 'location'])
# errors='ignore' keeps this line re-runnable once the columns have been removed.
data_test = data_test.drop(columns=['keyword', 'location'], errors='ignore')

In [None]:
# The major preprocessing steps for NLP are:
# 1. Removing punctuation and numbers
# 2. Removing stopwords
# 3. Tokenization
# 4. Converting to lower case
# 5. Text normalization, i.e. stemming/lemmatization
# 6. Bag of words: creating a document-term matrix (DTM) with CountVectorizer or TfidfVectorizer

In [None]:
# Remove punctuation and numbers, and convert the text to lower case.
def rem_punct(text):
    """Return `text` lower-cased, without `string.punctuation` or numeric characters."""
    kept = []
    for ch in text:
        # Skip anything that is punctuation or numeric; keep everything else.
        if ch in string.punctuation or ch.isnumeric():
            continue
        kept.append(ch.lower())
    return "".join(kept)
# Add a cleaned `text1` column to the training frame first, then to the test frame.
for frame in (data1, data_test):
    frame['text1'] = frame['text'].apply(rem_punct)


# Tokenize the cleaned text and remove English stopwords.
_ENGLISH_STOPWORDS = None  # filled on first use so the stopword corpus is read only once


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it on first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def rem_stopwords(text):
    """Tokenize `text` with `word_tokenize` and return the tokens that are not stopwords.

    The stopword set is fetched once per call from a cache. The previous version
    rebuilt `set(stopwords.words('english'))` inside the comprehension's filter,
    i.e. once for every token of every row.
    """
    stop_set = _english_stopwords()
    return [word for word in word_tokenize(text) if word not in stop_set]


data1['text2'] = data1['text1'].apply(lambda x: rem_stopwords(x))
# NOTE(review): data_test gets no 'text2' column; the equivalent call is left disabled.
#data_test['text2'] = data_test['text1'].apply(lambda x:rem_stopwords(x))

In [None]:
# Text normalization: lemmatize every token (no stemming is applied here).
def lem(text):
    """Return the tokens of `text` (an iterable of words) passed through WordNetLemmatizer."""
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))
data1['text3'] = data1['text2'].apply(lem)
# NOTE(review): data_test gets no 'text3' column; the equivalent call is left disabled.
#data_test['text3'] = data_test['text2'].apply(lambda x:lem(x))

In [None]:
# Separate the target variable from the feature columns.
data1_target = data1.loc[:, 'target']
data1.drop(columns='target', inplace=True)
# Preview the first three rows of the remaining columns.
data1.head(3)


In [None]:
# Convert the cleaned text into a bag-of-words document-term matrix that can be paired
# with the target variable and passed to any classification model.
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
# The tokenizer keeps runs of ASCII letters and digits, so symbols are dropped
# (digits are retained by this pattern).
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    tokenizer=token.tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),  # unigrams only
    min_df=5,            # integer thresholds are absolute document counts
    max_df=5000,
)
text_counts = cv.fit_transform(data1['text1'])
text_counts.shape

In [None]:
# Same document-term matrix, but weighted with TF-IDF instead of raw counts.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
# The tokenizer keeps runs of ASCII letters and digits, so symbols are dropped
# (digits are retained by this pattern).
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
tf = TfidfVectorizer(
    tokenizer=token.tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),  # unigrams only
    min_df=5,            # integer thresholds are absolute document counts
    max_df=5000,
)
text_counts_tf = tf.fit_transform(data1['text1'])
text_counts_tf.shape

In [None]:
# Transform the competition test set into a TF-IDF document-term matrix.
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
# Reuse `tf`, the vectorizer already fitted on the training text, so the test matrix has
# the same columns as the matrix the models are trained on. The separate `tf_test`
# vectorizer that used to be built here was never fitted or used (nor was the tokenizer
# rebuilt for it), so that dead code has been removed.
test_text_counts_tf = tf.transform(data_test['text1'])
test_text_counts_tf.shape



In [None]:
# Hold out 20% of the TF-IDF training matrix for validation (fixed seed for a stable split).
x_train, x_test, y_train, y_test = train_test_split(
    text_counts_tf,
    data1_target,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Random forest classifier on the TF-IDF features (x_train is split from text_counts_tf).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Seed the forest so the model, and therefore the reported accuracy, is reproducible
# across runs; 1 matches the random_state used for the train/test split.
rc = RandomForestClassifier(random_state=1)
rc_model = rc.fit(x_train, y_train)
rc_predicted = rc_model.predict(x_test)
# Accuracy on the held-out split.
rc_accuracy = accuracy_score(y_test, rc_predicted)
rc_accuracy

In [None]:
# Multinomial Naive Bayes baseline on the same train/validation split.
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb = MultinomialNB()
nb_model = nb.fit(x_train, y_train)
nb_predicted = nb_model.predict(x_test)

# Accuracy on the held-out split.
nb_accuracy = metrics.accuracy_score(y_test, nb_predicted)
nb_accuracy

In [None]:
# Support vector classifier with scikit-learn's default hyper-parameters.
from sklearn.svm import SVC

sv = SVC()
sv_model = sv.fit(x_train, y_train)
sv_predicted = sv_model.predict(x_test)

# Accuracy on the held-out split.
sv_accuracy = accuracy_score(y_test, sv_predicted)
sv_accuracy

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb_model = xgb.fit(x_train, y_train)
xgb_predicted = xgb_model.predict(x_test)

# Accuracy on the held-out split.
xgb_accuracy = accuracy_score(y_test, xgb_predicted)
xgb_accuracy

In [None]:
# Hyper-parameter tuning for the SVM with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV
# One grid per kernel: the linear kernel ignores `gamma`, so crossing the two in a
# single dict refitted the same linear model once per gamma value (12 of the 30
# candidates were duplicates). `gamma` is now searched for the RBF kernel only.
param_grid = [
    {'kernel': ['rbf'],
     'C': [0.1, 1, 1.5],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
    {'kernel': ['linear'],
     'C': [0.1, 1, 1.5]},
]

# 10-fold cross-validation; refit=True retrains the best candidate on all data given to fit().
grid = GridSearchCV(SVC(), param_grid, cv=10, refit=True, verbose=3)

# Fit the grid search on the full TF-IDF training matrix.
grid.fit(text_counts_tf, data1_target)

In [None]:
# Show the best grid-search parameters alongside the SVM fitted earlier.
# Only a cell's last expression is echoed, so the parameters are printed explicitly;
# as a bare first expression they were evaluated and silently discarded.
print(grid.best_params_)
sv_model

In [None]:
# Predict labels for the competition test set from its TF-IDF matrix.
# NOTE(review): this uses `sv_model` (default SVC fitted on the 80% training split),
# not the tuned `grid.best_estimator_` produced by the grid search. Confirm this is intended.
test_predictions = sv_model.predict(
    test_text_counts_tf
)


In [None]:

# Assemble the submission frame: one row per test id with its predicted target.
submission = pd.DataFrame(
    {
        'id': data_test['id'],
        'target': test_predictions,
    }
)
submission.head()

# Write the predictions to the working directory, without the index column.
filename = 'Text_classification_Predictions1.csv'
submission.to_csv(filename, index=False)
print('Saved file: ' + filename)