# **Import modules to attach and read the dataset**

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns



import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))



# **Read the dataset using pandas**

In [None]:
# Load the training CSV from the competition input folder and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/nlp-getting-started/train.csv')
train_df.head(n=5)

In [None]:
# Summary statistics of the training frame (pandas describes the numeric columns by default).
train_df.describe()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [None]:
# Pie chart of how often each target value occurs; both slices are pulled
# slightly out of the pie and labelled with their percentage share.
(
    train_df["target"]
    .value_counts()
    .plot.pie(explode=[0.1, 0.1], autopct='%1.1f%%', figsize=(6, 6))
)

In [None]:
import seaborn as sns
# Bar chart of the number of rows per target value.
sns.countplot(data=train_df, x='target')

In [None]:
# Frequency of each keyword. Missing keywords are real NaN values, not the
# string 'NaN', so a `!= 'NaN'` comparison never filters anything; counting the
# column itself works because Series.value_counts() leaves NaN out by default.
train_df["keyword"].value_counts()

In [None]:
# Discard the location, keyword and id columns; only `text` and `target` are
# used further on.
train_df = train_df.drop(columns=["location", "keyword", "id"])
train_df.head(n=5)

In [None]:
# Re-check the per-column missing-value counts now that columns have been dropped.
train_df.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory usage of the remaining training frame.
train_df.info()

# **Read the test dataset**

In [None]:
# Load the test CSV from the competition input folder; the bare name on the
# last line renders the frame below the cell.
test_df = pd.read_csv(filepath_or_buffer="/kaggle/input/nlp-getting-started/test.csv")
test_df

In [None]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

In [None]:
# Discard location and keyword from the test frame. `id` stays because the
# submission file is built from it.
test_df = test_df.drop(columns=["location", "keyword"])
test_df.head(n=5)

# **Import the required NLTK modules**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import tokenize

# **Lowercase the text data**

In [None]:
# Lowercase every tweet so the later steps see one spelling per word
# regardless of the original capitalisation.
train_df["text"] = train_df["text"].apply(lambda tweet: tweet.lower())
train_df

# **Expanding the contracted and abbreviated text data**

In [None]:
# Use the %pip magic rather than `!pip`: it installs into the environment of
# the running kernel, whereas `!pip` runs whichever pip the shell finds first.
%pip install contractions
import contractions

In [None]:
def con(data):
    """Return ``data`` after passing it through ``contractions.fix``.

    Args:
        data: The text whose contractions should be expanded.

    Returns:
        The value produced by ``contractions.fix(data)``.
    """
    return contractions.fix(data)

# Run the contraction expander over every tweet, then show the first tweet as
# a spot check.
train_df["text"] = train_df["text"].apply(con)
train_df.text[0]

# **Removing punctuation and special characters**

In [None]:
import re

def remove_sp(data):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Args:
        data: The string to clean.

    Returns:
        ``data`` with all other characters removed (they are dropped, not
        replaced by spaces).
    """
    return re.sub(r'[^A-Za-z0-9\s]', '', data)

# Strip punctuation and special characters from every tweet, then show the
# first tweet as a spot check.
train_df["text"] = train_df["text"].apply(remove_sp)
train_df["text"][0]

In [None]:
#import string
#punctuations=list(string.punctuation)
#train_df.text=train_df.text.apply(lambda x : " ".join(x for x in x.split() if x not in punctuations))

# **Removing Stopwords**

In [None]:
nltk.download('stopwords')
# English stop words minus "no" and "not", which are deliberately kept in the
# text (presumably because negation changes what a tweet means). Filtering
# instead of list.remove() cannot raise if a word is absent from the corpus.
stopword_list = [word for word in stopwords.words('english') if word not in ('no', 'not')]
# Constant-time membership test; scanning the list for every word of every
# tweet is needlessly slow.
_stopword_set = frozenset(stopword_list)

train_df.text = train_df.text.apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in _stopword_set)
)
train_df['text'][5]

# **Tokenization**

In [None]:
# word_tokenize relies on the Punkt sentence models. Recent NLTK releases load
# them from the 'punkt_tab' resource while older releases use 'punkt', so both
# are requested; a failed download is reported by nltk.download without raising.
nltk.download('punkt')
nltk.download('punkt_tab')
# Replace each tweet string with its list of word tokens.
train_df['text'] = train_df.text.apply(word_tokenize)
train_df['text'][0]


# **Lemmatization**

In [None]:
# WordNetLemmatizer looks words up in the WordNet corpus, so fetch it first.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# Lemmatize token by token; every row remains a list of strings.
train_df['text'] = train_df['text'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
train_df['text']

In [None]:
# Convert every row to a string for the vectorizer. The rows hold token lists
# at this point, so what gets stored is the list's printed form
# (e.g. "['a', 'b']"), not the tokens joined by spaces.
train_df.text= train_df.text.astype(str)

In [None]:
# Preview the training frame after all cleaning steps.
train_df.head()

# **Creating the features and the target variables**

In [None]:
# Features are the cleaned training tweets and the label is the `target`
# column; the test tweets are still unprocessed at this point.
X, Y = train_df["text"], train_df["target"]
X_test = test_df["text"]

In [None]:
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.pipeline import Pipeline
#from sklearn.svm import SVC

# **Applying the TF-IDF (Term Frequency–Inverse Document Frequency) vectorizer to convert the text into numeric features**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings; the fitted object is reused later to
# transform the test tweets.
tfidf=TfidfVectorizer()
# Fit on the training text and encode it as the training feature matrix in one call.
x_train_tfidf = tfidf.fit_transform(X)


# **Applying Support Vector Machine Classifier**

In [None]:
np.random.seed(42)
from sklearn.svm import SVC
# Support vector classifier with default hyper-parameters, fitted on the
# TF-IDF training matrix.
svc_clf = SVC().fit(x_train_tfidf, Y)
# Accuracy on the same data the model was fitted on, not a held-out estimate.
svc_clf.score(X=x_train_tfidf, y=Y)

In [None]:
np.random.seed(42)
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, fitted on the TF-IDF training
# matrix.
rf_clf = RandomForestClassifier().fit(x_train_tfidf, Y)
# Accuracy on the same data the model was fitted on, not a held-out estimate.
rf_clf.score(X=x_train_tfidf, y=Y)

In [None]:
# Seed NumPy's global RNG as the other model cells do: with no random_state
# given, fold shuffling and the forests draw from it, so without the seed this
# cell gives different results each time it is re-run.
np.random.seed(42)
from mlxtend.classifier import StackingCVClassifier
# Stack the SVC and the random forest, with a random forest as meta-classifier.
# NOTE(review): rf_clf serves both as a base learner and as the meta-classifier
# - confirm this is intended.
scv=StackingCVClassifier(classifiers=[svc_clf,rf_clf],meta_classifier= rf_clf)
scv.fit(x_train_tfidf,Y)
# Accuracy on the same data the model was fitted on, not a held-out estimate.
scv.score(x_train_tfidf,Y)

In [None]:
np.random.seed(42)
from sklearn import linear_model
# Ridge classifier with default hyper-parameters; this is the model used for
# the test predictions.
rd_clf = linear_model.RidgeClassifier().fit(x_train_tfidf, Y)
# Accuracy on the same data the model was fitted on, not a held-out estimate.
rd_clf.score(X=x_train_tfidf, y=Y)

In [None]:

def _prepare_tweet(text):
    """Apply the training-time cleaning steps to one raw tweet.

    The steps, in order: lowercase, expand contractions, strip special
    characters, drop stop words, tokenize, lemmatize. The token list is then
    converted with ``str`` so the result has the same form as the training text.

    Args:
        text: One raw tweet.

    Returns:
        The string form of the tweet's list of lemmatized tokens.
    """
    cleaned = remove_sp(con(text.lower()))
    kept = " ".join(word for word in cleaned.split() if word not in stopword_list)
    return str([lemmatizer.lemmatize(token) for token in word_tokenize(kept)])


# Start from the untouched test_df column rather than from X_test itself, so
# re-running this cell never processes already-cleaned text a second time.
X_test = test_df.text.apply(_prepare_tweet)
# Encode with the vectorizer fitted on the training text (transform only, no refit).
x_test_tfidf = tfidf.transform(X_test)
x_test_tfidf

In [None]:
# Predict a label for every test tweet with the ridge classifier; the bare
# name on the last line displays the result.
predictions=rd_clf.predict(x_test_tfidf)
predictions

In [None]:
# The competition's submission format uses the lowercase column names 'id' and
# 'target' (as in its sample submission); the headers written here must match.
output = pd.DataFrame({'id': test_df.id, 'target': predictions})
# index=False keeps the DataFrame index out of the file, leaving just the two columns.
output.to_csv('my_submission.csv', index=False)