In [None]:
#-----Importing Required Libraries-----#
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [None]:
#---Text Preprocessing-----------#

def clean_str(string):
    """
    Normalize a raw text string into lowercase, single-space-separated text.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The rules below are applied strictly in the listed order:
      1. contraction suffixes ('s, 've, n't, 're, 'd, 'll) are deleted;
      2. commas, parentheses, question marks and remaining apostrophes are
         deleted, while each "!" is padded with a space on both sides;
      3. any character outside letters, digits and a small punctuation set
         is replaced by a space;
      4. every digit, together with any word characters that directly
         follow it, is deleted;
      5. runs of two or more whitespace characters collapse to one space.

    Args:
        string: the text to clean (must be a ``str``).

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and
        lowercased.
    """
    # Ordered (pattern, replacement) pairs. The order is significant: the
    # contraction suffixes must go before the bare apostrophe is stripped.
    substitutions = (
        (r"\'s", ""),
        (r"\'ve", ""),
        (r"n\'t", ""),
        (r"\'re", ""),
        (r"\'d", ""),
        (r"\'ll", ""),
        (r",", ""),
        (r"!", " ! "),
        (r"\(", ""),
        (r"\)", ""),
        (r"\?", ""),
        (r"'", ""),
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"[0-9]\w+|[0-9]", ""),
        (r"\s{2,}", " "),
    )

    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

# Load the labelled training set and pull out the target labels.
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
y = data['target'].tolist()

# Clean every document and lemmatize each token as a verb.
# - Missing text cells are coerced to empty strings so clean_str (which calls
#   re.sub) never receives a non-string value.
# - The list is built in one pass instead of mutating it by index, and the
#   per-row progress print is replaced by a single summary line: one line per
#   document floods the cell output and hides the feature count printed below.
x = [
    ' '.join(Word(word).lemmatize("v") for word in clean_str(value).split())
    for value in data['text'].fillna('').astype(str).tolist()
]
print("processed documents:", len(x))

# TF-IDF features over the cleaned text: English stop words are dropped and
# terms that occur in fewer than two documents are ignored (min_df=2).
vect = TfidfVectorizer(stop_words='english', min_df=2)
X = vect.fit_transform(x)
Y = np.array(y)
print("no of features extracted:", X.shape[1])

In [None]:
#-------Splitting data into Train-Test----------#

# Hold out 20% of the rows for evaluation; the split is seeded so it is
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

print("train size:", X_train.shape)
print("test size:", X_test.shape)

#-------Building Model--------------------------#

# random_state seeds the forest's bootstrap sampling and feature selection.
# Without it the reported metrics (and the downstream predictions) change
# from run to run even though the train/test split itself is fixed.
model = RandomForestClassifier(n_estimators=300, max_depth=150, n_jobs=1, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split. The confusion matrix is printed as well,
# since it was previously computed but never displayed.
y_pred = model.predict(X_test)
c_mat = confusion_matrix(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)
print(c_mat)
print(cls_report)

In [None]:
#----------Fetching predictions using the built model-------------#
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Apply the same cleaning + verb lemmatization that the training text went
# through before vectorizing. The TF-IDF vocabulary was fitted on preprocessed
# text, so transforming raw text would produce features that do not match
# what the model was trained on. Missing text cells become empty strings so
# clean_str never receives a non-string value.
test_text = [
    ' '.join(Word(word).lemmatize("v") for word in clean_str(text).split())
    for text in test_data['text'].fillna('').astype(str).tolist()
]
test_text_vect = vect.transform(test_text)
test_prediction = model.predict(test_text_vect)

# Name the prediction column explicitly. An unnamed DataFrame column is
# written to the CSV with the header "0"; 'target' mirrors the label column
# of the training data (presumably what the submission format expects too).
final_prediction = pd.DataFrame({'target': test_prediction})
Result = pd.concat([test_data['id'], final_prediction], axis=1)
Result.to_csv('prediction.csv', index=False)