In [None]:
pip install nltk

In [3]:
import numpy as np
import pandas as pd
import langid
import gc

In [4]:
# Load the labelled training set; later cells read its `question_text` and `target` columns.
train_df = pd.read_csv("./AskReddit Dataset/train.csv")

In [5]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What is the role of Lua in Civ4?,0
1,bdb84f519e7b46e7b7bb,What are important chapters in Kannada for 10 ...,0
2,29c88db470e2eb5c97ad,Do musicians get royalties from YouTube?,0
3,3387d99bf2c3227ae8f1,What is the difference between Scaling Social ...,0
4,e79fa5038f765d0f2e7e,Why do elevators go super slow right before th...,0


In [6]:
# Load the unlabelled test set; later cells read its `question_text` and `qid` columns.
test_df = pd.read_csv("./AskReddit Dataset/test.csv")

In [7]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,qid,question_text
0,0a824224322f0a36025f,Why is my fish tank so cloudy?
1,28af14c4e4777ce1273e,Are AAP supporters/leaders hypocrites?
2,6892a52c51103dd95044,Can you still get a ticket if you shut off you...
3,badd9e8886d73fc1fe4e,Why should any liberal or caring person want t...
4,4ef178f82a465e4804ae,How can I know who got into my PC using anydesk?


In [7]:
# Count rows per label; in the saved output below, label 0 heavily outnumbers label 1.
train_df['target'].value_counts()

0    612656
1     40405
Name: target, dtype: int64

### Imports needed

In [None]:
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer


## Approach 1 - Logistic Regression (bag of words, up to 150,000 features)


### Preprocessing

In [None]:
# Bag-of-words vectorizer whose vocabulary is capped at 150,000 terms.
vectorizer = CountVectorizer(max_features = 150000)

In [None]:
# Vectorize train and test questions as one corpus. Training rows come first,
# so rows [0, len(train_df)) of `features` are the labelled ones and the rest
# belong to the test set.
# NOTE(review): the vocabulary is learned from test text as well - confirm this is intended.
features = vectorizer.fit_transform(
    train_df['question_text'].tolist()
    + test_df['question_text'].tolist()
)

In [None]:
from sklearn.model_selection import train_test_split

# Only the first len(train_df) rows of `features` carry labels; keep 85% of
# them for fitting and hold out the rest for validation (fixed seed).
n_labelled_rows = len(train_df)
X_train, X_test, y_train, y_test = train_test_split(
    features[:n_labelled_rows],
    train_df['target'],
    train_size=0.85,
    random_state=1234,
)


### Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Unfitted logistic-regression classifier using the liblinear solver.
log_model = LogisticRegression(solver='liblinear')


In [None]:
# Fit on the training portion of the split.
log_model = log_model.fit(X=X_train, y=y_train)


In [None]:
# Predict labels for the held-out rows; the bare name displays the resulting array.
y_pred = log_model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import classification_report

# Text report comparing held-out labels with the model's predictions.
validation_report = classification_report(y_test, y_pred)
print(validation_report)

In [None]:
# Refit a fresh model on every labelled row, then predict the test rows,
# which sit after the first len(train_df) rows of `features`.
n_train_rows = len(train_df)
log_model = LogisticRegression(solver='liblinear').fit(
    X=features[:n_train_rows],
    y=train_df['target'],
)
test_pred = log_model.predict(features[n_train_rows:])

In [None]:
# Pair each test question id with its predicted label.
output_qid = test_df["qid"].to_numpy()
output_pred_target = test_pred

# The previously built `list_of_tuples` was never read and allocated one
# Python tuple per test row, so it has been dropped.
data = {'qid': output_qid, 'target': output_pred_target}

# Write a two-column (qid, target) CSV; index=False leaves out the row index.
df = pd.DataFrame(data)
df.to_csv('output_targets_lr1.csv', index=False)

In [None]:
# Preview the first rows of the frame that was just written to CSV.
df.head()


## Approach 2 - Logistic Regression (bag of words, up to 12,000 features)

### Preprocessing


In [None]:
# Bag-of-words vectorizer whose vocabulary is capped at 12,000 terms.
vectorizer = CountVectorizer(max_features = 12000)

In [None]:
# Re-vectorize train + test questions with the current `vectorizer`.
# Row order is train first, then test, which later slicing by len(train_df) relies on.
features = vectorizer.fit_transform(
    train_df['question_text'].tolist()
    + test_df['question_text'].tolist()
)

In [None]:
from sklearn.model_selection import train_test_split

# 85% / 15% split of the labelled rows (the leading len(train_df) rows of
# `features`), seeded so the partition is repeatable.
n_labelled_rows = len(train_df)
X_train, X_test, y_train, y_test = train_test_split(
    features[:n_labelled_rows],
    train_df['target'],
    train_size=0.85,
    random_state=1234,
)


In [None]:
import gc
# Run a garbage-collection pass; the notebook displays gc.collect()'s return value.
gc.collect()

### Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Fresh, unfitted logistic-regression classifier (liblinear solver) for this approach.
log_model = LogisticRegression(solver='liblinear')


In [None]:
# Fit on the training portion of the split.
log_model = log_model.fit(X=X_train, y=y_train)


In [None]:
# Predict labels for the held-out rows; the bare name displays the resulting array.
y_pred = log_model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import classification_report

# Text report comparing held-out labels with the model's predictions.
validation_report = classification_report(y_test, y_pred)
print(validation_report)

In [None]:
# Train a new model on all labelled rows and predict the test rows that
# follow them in `features`.
n_train_rows = len(train_df)
log_model = LogisticRegression(solver='liblinear').fit(
    X=features[:n_train_rows],
    y=train_df['target'],
)
test_pred = log_model.predict(features[n_train_rows:])


In [None]:
# Pair each test question id with its predicted label.
output_qid = test_df["qid"].to_numpy()
output_pred_target = test_pred

# The previously built `list_of_tuples` was never read and allocated one
# Python tuple per test row, so it has been dropped.
data = {'qid': output_qid, 'target': output_pred_target}

# Write a two-column (qid, target) CSV; index=False leaves out the row index.
df = pd.DataFrame(data)
df.to_csv('output_targets_lr2.csv', index=False)

In [None]:
# Preview the first rows of the frame that was just written to CSV.
df.head()


## Approach 3 - Multi-Layer Perceptron

### Preprocessing

In [None]:
# Module-level Porter stemmer instance; `tokenize` passes it to `stem_tokens`.
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Stem every token with the given stemmer.

    Args:
        tokens: iterable of tokens to stem.
        stemmer: object exposing a ``stem(token)`` method.

    Returns:
        A new list with ``stemmer.stem(token)`` for each token, in input order.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems


In [None]:
def tokenize(text):
    """Turn raw text into a list of stemmed word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is split with ``nltk.word_tokenize``, and each token is stemmed
    with the module-level ``stemmer`` via ``stem_tokens``.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)


In [None]:
# Imported in this cell so the cell does not depend on edits to the import cell.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# CountVectorizer removes stop words AFTER the custom tokenizer has run, i.e.
# on already-stemmed tokens. The plain 'english' list holds unstemmed words,
# so stemmed forms of stop words would slip through the filter. Extend the
# list with the output of `tokenize` for every stop word so that filtering
# matches the preprocessing.
stemmed_stop_words = sorted(
    set(ENGLISH_STOP_WORDS)
    | {stem for word in ENGLISH_STOP_WORDS for stem in tokenize(word)}
)

vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    token_pattern=None,  # unused when a custom tokenizer is supplied
    lowercase=True,
    stop_words=stemmed_stop_words,
    max_features=4000,
)

In [None]:
# Vectorize train + test questions with the stemming vectorizer.
# Row order is train first, then test, which later slicing by len(train_df) relies on.
features = vectorizer.fit_transform(
    train_df['question_text'].tolist()
    + test_df['question_text'].tolist()
)

In [None]:
from sklearn.model_selection import train_test_split

# 85% / 15% split of the labelled rows (the leading len(train_df) rows of
# `features`), seeded so the partition is repeatable.
n_labelled_rows = len(train_df)
X_train, X_test, y_train, y_test = train_test_split(
    features[:n_labelled_rows],
    train_df['target'],
    train_size=0.85,
    random_state=1234,
)


In [None]:
import gc
# Run a garbage-collection pass; the notebook displays gc.collect()'s return value.
gc.collect()

### Model

In [None]:
from sklearn.neural_network import MLPClassifier


In [None]:
# Small MLP with hidden layers of 5 and 2 units, Adam solver and a fixed
# seed, fitted on the training portion of the split.
clf = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
).fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out rows; the bare name displays the resulting array.
y_pred = clf.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import classification_report

# Text report comparing held-out labels with the MLP's predictions.
validation_report = classification_report(y_test, y_pred)
print(validation_report)

In [None]:
# Train a new MLP with the same settings on all labelled rows, then predict
# the test rows that follow them in `features`.
n_train_rows = len(train_df)
clf = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
).fit(X=features[:n_train_rows], y=train_df['target'])
test_pred = clf.predict(features[n_train_rows:])

In [None]:
# Pair each test question id with its predicted label.
output_qid = test_df["qid"].to_numpy()
output_pred_target = test_pred

# The previously built `list_of_tuples` was never read and allocated one
# Python tuple per test row, so it has been dropped.
data = {'qid': output_qid, 'target': output_pred_target}

# Write a two-column (qid, target) CSV; index=False leaves out the row index.
df = pd.DataFrame(data)
df.to_csv('output_targets_mlpc.csv', index=False)

In [None]:
# Preview the first rows of the frame that was just written to CSV.
df.head()