In [None]:
import numpy as np 
import pandas as pd 
import string
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [None]:
# Load the training data and display the resulting frame.
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
df_train

In [None]:
# Load the test data and display the resulting frame.
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')
df_test

In [None]:
# Load the sample submission file to inspect the expected output layout.
# NOTE(review): nothing later in the visible notebook reads `sample_submission`.
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
sample_submission

### EDA :

In [None]:
df_train.shape   # There are 7613 rows and 5 columns

In [None]:
df_train.text.isnull().values.any()   # There is no null value in text column

In [None]:
df_train.location.value_counts()

In [None]:
df_train.keyword.value_counts()

In [None]:
df_train.text.describe()

In [None]:
df_train.keyword.describe()

In [None]:
df_train.location.describe()

### Dropping columns :

In [None]:
# Drop the columns that the text model does not use.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: the in-place version raised KeyError on a second execution
# because the columns were already gone.
df_train = df_train.drop(columns=['location', 'keyword'], errors='ignore')

In [None]:
df_train

In [None]:
df_train.text[:3]

### Cleaning the data :

In [None]:
df_train.columns

In [None]:
# Strip '.', ',' and '&' as *literal* characters, then lower-case the text.
# regex=False is passed explicitly: the default value of `regex` differs
# between pandas versions, and '.' is a wildcard when treated as a pattern.
df_train['text'] = (
    df_train['text']
    .str.replace('.', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('&', '', regex=False)
    .str.lower()
)

### Removing punctuation :

In [None]:
string.punctuation

In [None]:
def remove_punctuation(text):
    """Return `text` with every character listed in ``string.punctuation`` removed."""
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

In [None]:
# Strip punctuation from every training text.
df_train['text'] = df_train['text'].apply(remove_punctuation)

### Tokenization :

In [None]:
def tokenize(string):
    """Split `string` on runs of whitespace and return the list of words."""
    return string.split()

In [None]:
# Turn every training text into a list of word tokens.
df_train['text'] = df_train['text'].apply(tokenize)

In [None]:
df_train.head()

### Removing stop words :

In [None]:
# English stop words from NLTK, stored as a set so the per-token membership
# test in remove_stopwords() is a constant-time lookup instead of a list scan.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Return the tokens of `text` that are not in the module-level `stopwords`."""
    kept_tokens = []
    for token in text:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# Filter stop words out of every training token list.
df_train['text'] = df_train['text'].apply(remove_stopwords)

### Stemming :

In [None]:
# Single PorterStemmer instance, reused for every row by stemming().
porter_stemmer = PorterStemmer()

In [None]:
def stemming(text):
    """Return a list with each token of `text` passed through `porter_stemmer.stem`."""
    return list(map(porter_stemmer.stem, text))
df_train['text'] = df_train['text'].apply(stemming)

### Lemmatization :

In [None]:
# Single WordNetLemmatizer instance, reused for every row by lemmatizer().
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatization helper
def lemmatizer(text):
    """Return a list with each token of `text` passed through `wordnet_lemmatizer.lemmatize`."""
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Fetch the WordNet corpus used by WordNetLemmatizer; it needs to be available
# before lemmatizer() is first applied.
nltk.download('wordnet')

In [None]:
# Lemmatize every (already stemmed) training token list.
df_train['text'] = df_train['text'].apply(lemmatizer)

### Joining tokens back into strings :

In [None]:
# CountVectorizer is fed strings, so join each token list back into a single
# space-separated string.
df_train['text_strings'] = df_train['text'].apply(
    lambda tokens: ' '.join(map(str, tokens))
)

In [None]:
# df_train

In [None]:
df_train['text_strings'].head()


## Text vectorization :

In [None]:
# Fit the vectorizer on the training strings and encode them in one step.
# The fitted `vectorizer` is reused later to transform the test strings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_train['text_strings'])


In [None]:
# Densify the vectorizer output.
# NOTE(review): the dense matrix can be large — confirm it fits in memory.
x_train = X.toarray()

In [None]:
# x_train is already an ndarray (it comes from .toarray()); np.asarray returns
# it as-is, whereas np.array would allocate a second full copy of the matrix.
x_train = np.asarray(x_train)

In [None]:
x_train.shape

In [None]:
# Labels: the 'target' column of the training frame.
y_train = df_train['target']

In [None]:
y_train.shape

### Fitting model: (Logistic regression)

In [None]:
# Logistic regression fit on the full training matrix (no hold-out split);
# random_state is pinned to 42.
clf = LogisticRegression(random_state=42)
clf.fit(x_train,y_train)

In [None]:
# Predictions on x_train, i.e. the same rows clf was fit on.
pred = clf.predict(x_train)

In [None]:
# Training accuracy: `pred` was produced from the data the model was fit on,
# so this is not an estimate of performance on unseen tweets.
accuracy_score(y_train, pred)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of a repr full of '\n' escapes.
# Like the accuracy above, this is measured on the training data.
print(classification_report(y_train, pred))


### Fitting model : (Random Forest)

In [None]:
# Random forest fit on the full training matrix.
clf2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=700,  # 500
    random_state=42,
)
clf2.fit(x_train, y_train)

In [None]:
# Predictions on x_train, i.e. the same rows clf2 was fit on.
pred2 = clf2.predict(x_train)

In [None]:
# Training accuracy of the random forest (measured on the data it was fit on),
# not a held-out score.
accuracy_score(y_train, pred2)

### Applying on test data :

In [None]:
# Replace missing values with empty strings and keep only the text column.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: the in-place drop raised KeyError on a second execution because
# the columns were already gone.
df_test = df_test.fillna('').drop(
    columns=['id', 'keyword', 'location'], errors='ignore'
)

In [None]:
# Apply the same preprocessing to the test text as was applied to the training
# text: lower-case -> strip punctuation -> tokenize -> drop stop words -> stem
# -> lemmatize. The previous version skipped lower-casing and lemmatization, so
# capitalised stop words ("The", "I", ...) survived and the token forms could
# differ from the vocabulary the vectorizer was fit on.
# (The training-side removal of '.', ',' and '&' is covered by
# remove_punctuation, since all three are in string.punctuation.)
df_test['text'] = (
    df_test['text']
    .str.lower()
    .apply(remove_punctuation)
    .apply(tokenize)
    .apply(remove_stopwords)
    .apply(stemming)
    .apply(lemmatizer)
)

In [None]:
# Join each test token list back into one space-separated string for the vectorizer.
df_test['text_strings'] = df_test['text'].apply(
    lambda tokens: ' '.join(map(str, tokens))
)

In [None]:
# Encode the test strings with the already-fitted vectorizer (transform only,
# no re-fit) and densify the result.
x_test = vectorizer.transform(df_test['text_strings']).toarray()


In [None]:
# x_test is already an ndarray (it comes from .toarray()); np.asarray returns
# it as-is, whereas np.array would allocate a second full copy of the matrix.
x_test = np.asarray(x_test)

In [None]:
# Test-set predictions come from the random forest (clf2), not the logistic regression.
y_test_pred = clf2.predict(x_test)

In [None]:
y_test_pred

### Saving output to csv :

In [None]:
# Re-read test.csv to get the 'id' column back; it was dropped from df_test
# before preprocessing.
submission = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
submission['target'] = y_test_pred

In [None]:
submission.head()

In [None]:
# Keep only the two columns written to the submission file.
final_submission = submission.loc[:, ['id', 'target']]

In [None]:
# Write the submission to the working directory; index=False omits the row index column.
final_submission.to_csv('final_submission.csv', index = False)