In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path found under the Kaggle input directory, then print one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: install the `zip` command-line tool with apt.
# NOTE(review): no other cell in this notebook appears to invoke `zip` — confirm this cell is still needed.
!apt-get install zip

# Preparation

In [None]:
# Relative paths to the competition's training and test CSV files (loaded with pandas further down).
train_csv_data = '../input/quora-insincere-questions-classification/train.csv'
test_csv_data = '../input/quora-insincere-questions-classification/test.csv'

In [None]:
#import the libraries
import matplotlib.pyplot as plt
import string
import nltk
import re
import seaborn as sns
from unidecode import unidecode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.utils import resample

In [None]:

# Show the current working directory (the relative CSV paths above are resolved against it).
print(os.getcwd())

In [None]:
# Load the training CSV into a DataFrame and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer=train_csv_data)

df_train.head(n=5)


In [None]:
# Load the competition test CSV and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer=test_csv_data)
df_test.head(n=5)

In [None]:
# Discard every training row that contains a missing value.
df_train = df_train.dropna()

In [None]:
# Number of rows per target label.
target_column = df_train['target']
target_column.value_counts()

In [None]:
# Split by label: target == 1 marks insincere questions, target == 0 sincere ones.
is_insincere = df_train['target'] == 1
is_sincere = df_train['target'] == 0
insincere_data = df_train[is_insincere]
sincere_data = df_train[is_sincere]


In [None]:
# Bar chart of how many sincere vs. insincere rows the training set contains.

y = df_train['target']
label_counts = y.value_counts()
label_counts.plot(kind='bar', rot=0)

In [None]:
# Under-sample the majority (sincere) class to four times the size of the
# minority (insincere) class; every insincere row is kept.
sincere = df_train[df_train.target == 0]
insincere = df_train[df_train.target == 1]
sincere_downsampled = resample(
    sincere,
    replace=False,  # without replacement: no question is duplicated in the sample
    # resample() raises if asked for more rows than exist when replace=False
    n_samples=min(len(sincere), len(insincere) * 4),
    random_state=2000,  # fixed seed so a fresh "Run All" draws the same sample
)
df_train_sampled = pd.concat([sincere_downsampled, insincere])
df_train_sampled

In [None]:
# Label distribution after under-sampling.
y = df_train_sampled['target']
sampled_label_counts = y.value_counts()
sampled_label_counts.plot(kind='bar', rot=0)

# Preprocess data

In [None]:
# Download the NLTK resources used below, then build the English stop-word set.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

In [None]:
# Offensive terms, written as one comma-separated string and parsed into a
# list of whitespace-stripped entries.
bad_words = "2 girls 1 cup, 2g1c, 4r5e, 5h1t, 5hit, a$$"
bad_words = list(map(str.strip, bad_words.split(",")))

In [None]:
# Lookup table mapping lower-case contractions / informal spellings to their
# expanded form. Each key appears exactly once (a repeated key in a dict
# literal silently keeps only the last value).
ACRONYMS = {
    "aren't": "are not", "can't": "cannot", "cant": "cannot", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    # "'d" is expanded as "would" throughout this table.
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "isn't": "is not",
    "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
    "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
    "she'd": "she would", "she'll": "she will", "she's": "she is",
    "shouldn't": "should not", "that's": "that is", "there's": "there is",
    "they'd": "they would", "they'll": "they will", "they're": "they are", "they've": "they have",
    "we'd": "we would", "we'll": "we will", "we're": "we are", "weren't": "were not", "we've": "we have",
    "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is",
    "who'd": "who would", "who'll": "who will", "who're": "who are", "who's": "who is", "who've": "who have",
    "won't": "will not", "wouldn't": "would not",
    "you'd": "you would", "you'll": "you will", "you're": "you are", "you've": "you have",
    "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have",
    "you'd've": "you would have", "you'll've": "you will have",
    "'re": " are", "wasn't": "was not", "tryin'": "trying",
}

In [None]:
def correct_acronym(text):
    """Expand known contractions / acronyms in ``text``.

    The text is split with NLTK's ``word_tokenize``; every token that is a key
    of ``ACRONYMS`` is replaced by its mapped value and the tokens are joined
    back together with single spaces.

    Args:
        text: The string to process.

    Returns:
        The re-joined string with matching tokens replaced.
    """
    tokens = word_tokenize(text)
    expanded = [ACRONYMS.get(token, token) for token in tokens]
    return " ".join(expanded)

    
def remove_stopword(text):
    """Remove every token of ``text`` that appears in ``stop_words``.

    The text is split with NLTK's ``word_tokenize`` and the remaining tokens
    are joined back together with single spaces.

    Args:
        text: The string to process.

    Returns:
        The re-joined string without the stop-word tokens.
    """
    tokens = word_tokenize(text)
    tokens_without_sw = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens_without_sw)

In [None]:
def preprocess(text):
    """Normalize one question string before bag-of-words vectorization.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode`` and lower-case;
      2. replace every entry of ``bad_words`` with the placeholder "bad words";
      3. strip URLs, HTML-like tags and e-mail addresses;
      4. expand contractions listed in ``ACRONYMS``;
      5. delete punctuation, squeeze runs of three or more identical
         characters down to two, and blank out digits;
      6. tokenize with ``word_tokenize`` and drop tokens in ``stop_words``.

    Args:
        text: Raw question text.

    Returns:
        The cleaned text as space-separated tokens.
    """
    # Lower-case first so the lower-case bad-word list and the lower-case
    # ACRONYMS keys also match capitalised input.
    text = unidecode(text).lower()

    # Must run before digits/punctuation are removed: entries such as "a$$"
    # or "2g1c" contain both.
    for word in bad_words:
        text = text.replace(word, "bad words")

    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove url
    text = re.sub(r'<.*?>+', '', text)                  # remove html-like tags
    text = re.sub(r'\S+@\S+', ' ', text)                # remove email

    # Expand contractions while the apostrophes are still present; once
    # punctuation is stripped, keys like "don't" can no longer match.
    text = re.sub(
        r"[a-z']+",
        lambda match: ACRONYMS.get(match.group(0), match.group(0)).lower(),
        text,
    )

    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove sign
    # Raw strings are required for the backreference (a plain '\1' is chr(1)).
    # Runs are squeezed to two characters, not one, so ordinary double letters
    # ("good", "all") are left intact.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\d+', ' ', text)  # remove number

    tokens = word_tokenize(text)
    tokens_without_sw = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens_without_sw)

In [None]:
# Store the cleaned version of question_text in a new column and preview the frame.
cleaned_questions = df_train_sampled['question_text'].apply(preprocess)
df_train_sampled['question_text_preprocess'] = cleaned_questions
df_train_sampled.head(n=5)

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud of the most frequent words in the sincere (target == 0) questions.
# All cleaned questions are joined into one string: str(Series) would only
# produce the truncated printout of the Series, not its full text.
sincere_text = " ".join(
    df_train_sampled.loc[df_train_sampled["target"] == 0, "question_text_preprocess"]
)
sincere_wordcloud = WordCloud(width=800, height=600, background_color='white', min_font_size=10).generate(sincere_text)
plt.figure(figsize=(8, 8))
plt.imshow(sincere_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud of the most frequent words in the insincere (target == 1) questions.
# All cleaned questions are joined into one string: str(Series) would only
# produce the truncated printout of the Series, not its full text.
insincere_text = " ".join(
    df_train_sampled.loc[df_train_sampled["target"] == 1, "question_text_preprocess"]
)
insincere_wordcloud = WordCloud(width=800, height=600, background_color='white', min_font_size=10).generate(insincere_text)
plt.figure(figsize=(8, 8))
plt.imshow(insincere_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from time import time

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score


# Vectorize data

In [None]:
# Count how often each vocabulary term occurs in sincere vs. insincere questions.
cvec = CountVectorizer()
# Learn the vocabulary from the same (preprocessed) column that is transformed
# below, so the vocabulary and the counted text agree.
cvec.fit(df_train_sampled.question_text_preprocess)

# Document-term count matrices, one per class.
sin_doc_matrix = cvec.transform(df_train_sampled[df_train_sampled.target == 0].question_text_preprocess)
insin_doc_matrix = cvec.transform(df_train_sampled[df_train_sampled.target == 1].question_text_preprocess)

# Sum over documents -> one total count per term.
sin_tf = np.sum(sin_doc_matrix, axis=0)
insin_tf = np.sum(insin_doc_matrix, axis=0)

sin = np.squeeze(np.asarray(sin_tf))
insin = np.squeeze(np.asarray(insin_tf))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
if hasattr(cvec, "get_feature_names_out"):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

term_freq_df = pd.DataFrame([sin, insin], columns=feature_names).transpose()

In [None]:
# Label the two per-class count columns and add their row-wise sum.
count_columns = ['sincere', 'insincere']
term_freq_df.columns = count_columns
term_freq_df['total'] = term_freq_df[count_columns].sum(axis=1)

# The ten terms that occur most often across both labels.
term_freq_df.sort_values(by='total', ascending=False).head(10)

In [None]:
# Bar chart of the ten most frequent tokens in insincere questions.
# Bar heights and tick labels come from the same sorted Series so they always line up.
top_insincere = term_freq_df.sort_values(by='insincere', ascending=False)['insincere'][:10]
y_pos = np.arange(len(top_insincere))
plt.figure(figsize=(20,10))
plt.bar(y_pos, top_insincere, align='center', alpha=0.5)
plt.xticks(y_pos, top_insincere.index, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 10 insincere tokens')
plt.title('Top 10 tokens in insincere questions')

In [None]:
# Bar chart of the ten most frequent tokens in sincere questions.
# Bar heights and tick labels come from the same sorted Series so they always line up.
top_sincere = term_freq_df.sort_values(by='sincere', ascending=False)['sincere'][:10]
y_pos = np.arange(len(top_sincere))
plt.figure(figsize=(20,10))
plt.bar(y_pos, top_sincere, align='center', alpha=0.5)
plt.xticks(y_pos, top_sincere.index, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 10 sincere tokens')
plt.title('Top 10 tokens in sincere questions')

In [None]:
# Scatter plot: each term's count in sincere questions (x) against its count
# in insincere questions (y); no regression line is drawn.
plt.figure(figsize=(8,6))
scatter_style = {'alpha':0.5}
ax = sns.regplot(
    data=term_freq_df,
    x="sincere",
    y="insincere",
    fit_reg=False,
    scatter_kws=scatter_style,
)
plt.xlabel('Sincere Frequency')
plt.ylabel('Insincere Frequency')
plt.title('Sincere Frequency vs Insincere Frequency')

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Binarize the target column; sparse_output=True makes the binarizer return a sparse matrix.
label_target = LabelBinarizer(sparse_output=True)
sampled_targets = df_train_sampled['target']
train_target = label_target.fit(sampled_targets).transform(sampled_targets)

train_target.shape

In [None]:
# Divide the sampled data into three parts: train (98%), validation (1%) and test (1%).
from sklearn.model_selection import train_test_split

SEED = 2000
x = df_train_sampled.question_text_preprocess
y = df_train_sampled.target

# First cut off 2% as a hold-out, then halve that hold-out into validation and test.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


def _label_percentages(features, labels):
    """Return (sincere %, insincere %) of the rows in ``features`` given ``labels``."""
    total = len(features) * 1.
    return tuple((len(features[labels == label]) / total) * 100 for label in (0, 1))


train_sincere_pct, train_insincere_pct = _label_percentages(x_train, y_train)
print("Train set has total {0} entries with \n {1:.2f}% sincere, {2:.2f}% insincere".format(
    len(x_train), train_sincere_pct, train_insincere_pct))

validation_sincere_pct, validation_insincere_pct = _label_percentages(x_validation, y_validation)
print("Validation set has total {0} entries with \n {1:.2f}% sincere, {2:.2f}% insincere".format(
    len(x_validation), validation_sincere_pct, validation_insincere_pct))

test_sincere_pct, test_insincere_pct = _label_percentages(x_test, y_test)
print("Test set has total {0} entries with \n {1:.2f}% sincere, {2:.2f}% insincere".format(
    len(x_test), test_sincere_pct, test_insincere_pct))

In [None]:
# Refit the vectorizer on the training split only. The validation and test
# splits are not used separately here: they are scored together as one hold-out set.
vt_x_train = cvec.fit_transform(x_train)
vt_x_test = cvec.transform(x_validation_and_test)



# Train model

In [None]:
# Fit a logistic-regression classifier on the count-vectorized training split
# and predict labels for the combined validation+test hold-out.
# NOTE: despite its name, `count_vectorizer` holds the classifier (trained on
# CountVectorizer features); the name is kept because later cells use it.
count_vectorizer = LogisticRegression(
    n_jobs=10,
    solver='saga',
    C=0.1,
    verbose=1,
    random_state=SEED,  # saga shuffles the data; seed it so runs are reproducible
)

count_vectorizer.fit(vt_x_train, y_train)

y_prediction_count_vectorizer = count_vectorizer.predict(vt_x_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 on the combined validation+test hold-out.
report = classification_report(y_validation_and_test, y_prediction_count_vectorizer)
print("results\n")
print(report)

# Prediction

In [None]:
# Clean the competition test questions, vectorize them with the fitted
# vocabulary, and predict a label for each one.
cleaned_test_questions = df_test['question_text'].apply(preprocess)
df_test['clean_questions'] = cleaned_test_questions
test_vt_x = cvec.transform(df_test['clean_questions'])
predictions_test_data = count_vectorizer.predict(test_vt_x)

# Results and submission

In [None]:
# Attach the predictions to the test frame and keep only the two submission columns.
df_test['prediction'] = predictions_test_data
submission_columns = ['qid', 'prediction']
submissions = df_test[submission_columns]
submissions

In [None]:
# Write the submission frame to submission.csv in the working directory, without the index column.
submissions.to_csv('submission.csv', index=False)