In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Text cleaning: strip punctuation and digits, lowercase, and remove stop words

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS
def clean_question_text(questions: pd.Series) -> pd.Series:
    """Normalise a column of raw question strings.

    Steps, in order: drop punctuation, drop digit runs, lowercase, then remove
    every whitespace-separated token that appears in spaCy's ``STOP_WORDS``.

    Args:
        questions: Series of strings, one question per row.

    Returns:
        A new Series of cleaned strings with the same index as ``questions``.
    """
    return (
        questions
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would turn these two calls into no-ops.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\d+', '', regex=True)
        .str.lower()
        .apply(lambda text: ' '.join(word for word in text.split() if word not in STOP_WORDS))
    )


# Apply the same cleaning to the training and test questions so that the
# features built from them later stay comparable.
df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
df['question_text'] = clean_question_text(df['question_text'])

df_test = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
df_test['question_text'] = clean_question_text(df_test['question_text'])


In [None]:
# Preview the first five rows of the cleaned training frame.
df.head(n=5)

## Class balancing: tried both upsampling and downsampling; both gave the same result.

In [None]:
# Imported here because `resample` was previously used without being imported,
# which made this cell fail with a NameError on a fresh kernel.
from sklearn.utils import resample

# Split the training frame by class: target == 0 is the majority class,
# target == 1 the minority class.
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Alternative that was also tried: downsample the majority class instead.
# df_majority_downsampled = resample(df_majority,
#                                    replace=False,                # sample without replacement
#                                    n_samples=len(df_minority),   # match the minority class
#                                    random_state=173)             # reproducible results
# df = pd.concat([df_majority_downsampled, df_minority])

# Upsample the minority class so both classes end up with the same row count.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                 # sample with replacement
                                 n_samples=len(df_majority),   # match the majority class
                                 random_state=173)             # reproducible results

# Backward-compatible alias for the original (misspelled, misleading) variable name.
df_majority_updampled = df_minority_upsampled

# NOTE: this rebinds `df`, so re-running the cell without re-running the
# loading cell resamples from the already-balanced frame.
df = pd.concat([df_minority_upsampled, df_majority])

In [None]:
# Summarise the re-balanced frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of rows in the frame after balancing.
df.shape[0]

In [None]:
# Rows per class, to confirm the two classes are now the same size.
df.target.value_counts()

## Second stop-word pass with gensim
The earlier cleaning step used spaCy's `STOP_WORDS` (not nltk); gensim's stop-word list is applied as well so that stop words missing from the first list are also removed.

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

# Second stop-word pass: lowercase each question, then strip gensim's stop words.
# The same two steps are applied to the training and the test frame.
for frame in (df, df_test):
    lowered = frame['question_text'].str.lower()
    frame['question_text'] = lowered.apply(remove_stopwords)

df.head()

## Tokenization

In [None]:
from gensim.utils import simple_preprocess

def _tokenize(text):
    """Tokenize one question with gensim's simple_preprocess (deacc=True)."""
    return simple_preprocess(text, deacc=True)


# Add a 'tokenized_text' column (one token list per row) to both frames.
# Assigning a plain list keeps the assignment positional.
df['tokenized_text'] = list(map(_tokenize, df['question_text']))
df_test['tokenized_text'] = list(map(_tokenize, df_test['question_text']))

print(df['tokenized_text'].head(10))

## Stemming the data

In [None]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in one token list."""
    return [porter_stemmer.stem(token) for token in tokens]


# Stem each row's token list; the results go into a new 'stemmed_tokens' column.
df['stemmed_tokens'] = list(map(_stem_tokens, df['tokenized_text']))
df_test['stemmed_tokens'] = list(map(_stem_tokens, df_test['tokenized_text']))

df['stemmed_tokens'].head(10)

## TF-IDF Vectorizer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X = df['stemmed_tokens']
X_TEST = df_test['stemmed_tokens']
y = df['target']

# TfidfVectorizer expects one string per document, so join each token list back up.
X_TFIDF = X.apply(lambda x : " ".join(x))
X_TEST_TFIDF = X_TEST.apply(lambda x : " ".join(x))

# Split BEFORE fitting the vectorizer. Fitting TF-IDF on all rows and splitting
# afterwards lets the held-out rows shape the vocabulary and IDF weights, which
# leaks validation information into training. Same test_size and random_state as
# before, so the row partition itself is unchanged.
# NOTE: `df` was upsampled with replacement in an earlier cell, so copies of the
# same minority row can still land on both sides of this split; the validation
# scores remain optimistic for that reason.
X_train_docs, X_val_docs, y_train, y_test = train_test_split(
    X_TFIDF, y, test_size=0.33, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_docs)   # learn vocabulary/IDF on training rows only
X_test = vectorizer.transform(X_val_docs)          # held-out validation rows
X_test_tfidf = vectorizer.transform(X_TEST_TFIDF)  # competition test set

print(X_train.shape)
print(X_test.shape)
print(X_test_tfidf.shape)

## Implementing LinearSVC

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Fix the seed so repeated runs of the notebook produce the same model;
# 42 matches the random_state already used for train_test_split.
clf = LinearSVC(random_state=42)

In [None]:
# Fit the linear SVM on the training portion of the split.
clf.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out split; the bare name on the last line displays them.
predictions = clf.predict(X_test)
predictions

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out split.
val_confusion = confusion_matrix(y_test, predictions)
print(val_confusion)

In [None]:
# Per-class precision, recall and F1 on the held-out split.
val_report = classification_report(y_test, predictions)
print(val_report)

In [None]:
from sklearn import metrics
# Fraction of held-out rows whose predicted label equals the true label.
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Predict labels for the competition test set (the TF-IDF matrix built from df_test).
test_prediction = clf.predict(X_test_tfidf)
test_prediction

In [None]:
# Pair every test-set question id with its predicted label.
submission_columns = {
    'qid': df_test['qid'].to_numpy(),
    'prediction': test_prediction,
}
df_result = pd.DataFrame(submission_columns)
df_result

In [None]:
# Write the predictions to submission.csv, leaving out the DataFrame index column.
df_result.to_csv('submission.csv', index=False)