## Imports:

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')

In [2]:
#Importing the data
asklove=pd.read_csv('./raw_df.csv')

## Exploring Data - EDA

In [3]:
asklove.head(0)

Unnamed: 0.1,Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp


In [4]:
#droping usless columns
asklove=asklove.drop(['Unnamed: 0','created_utc', 'author', 'num_comments', 'score','is_self','timestamp'], axis=1)

In [5]:
asklove.head(10)

Unnamed: 0,title,selftext,subreddit
0,Missed opportunities...,I want to take a moment to talk about missed o...,love
1,You ever wish you can marry you can marry your...,[removed],love
2,You ever fall inlove with your own cousin and ...,[removed],love
3,My Love is My Own,My love is my own.\n\nIt is most powerful and ...,love
4,The day my stalker became the love of my life....,Things can change in an instant. In one blink ...,love
5,Tinder Date to Relationship,"Well..English is not my mother tongue, so you ...",love
6,Just a way of expressing what I feel since the...,4th March 2019\nI cant describe what im feelin...,love
7,He missed me,Even tho he didn’t say so he initiated a lot m...,love
8,emotional problems and left Index finger pain?!,when someone you love say or did something tha...,love
9,guuyyyssss I’m confused she talked about going...,,love


In [6]:
#checking [removed] in our df
(asklove[(asklove['selftext']=='[removed]')]['selftext']).value_counts()

[removed]    985
Name: selftext, dtype: int64

In [7]:
#using fillna to fill non values '' and save it 
asklove.fillna('', inplace=True)

In [8]:
#using applymap lambda replacing [removed] with ' '
asklove=asklove.applymap(lambda x: x.replace('[removed]', ' '))

In [9]:
asklove.head(1)

Unnamed: 0,title,selftext,subreddit
0,Missed opportunities...,I want to take a moment to talk about missed o...,love


In [10]:
#combinging the two columns title and selftext and creatiing new column dada
asklove['data']=asklove['title']+asklove['selftext']

In [11]:
#droping the old columns 
asklove=asklove.drop(['title', 'selftext'],axis =1)

In [12]:
asklove.head()

Unnamed: 0,subreddit,data
0,love,Missed opportunities...I want to take a moment...
1,love,You ever wish you can marry you can marry your...
2,love,You ever fall inlove with your own cousin and ...
3,love,My Love is My OwnMy love is my own.\n\nIt is m...
4,love,The day my stalker became the love of my life....


In [13]:
# checking for null values
asklove.isna().sum()

subreddit    0
data         0
dtype: int64

In [14]:
asklove.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4975 entries, 0 to 4974
Data columns (total 2 columns):
subreddit    4975 non-null object
data         4975 non-null object
dtypes: object(2)
memory usage: 77.8+ KB


In [15]:
asklove.head()

Unnamed: 0,subreddit,data
0,love,Missed opportunities...I want to take a moment...
1,love,You ever wish you can marry you can marry your...
2,love,You ever fall inlove with your own cousin and ...
3,love,My Love is My OwnMy love is my own.\n\nIt is m...
4,love,The day my stalker became the love of my life....


In [16]:
#dummifiy subreddit column to make it 0 an 1 
column=['subreddit']
asklove=pd.get_dummies(asklove, columns=column, drop_first=True)

In [17]:
# Showing the first 10 comments.
asklove['data'][:10]

0    Missed opportunities...I want to take a moment...
1    You ever wish you can marry you can marry your...
2    You ever fall inlove with your own cousin and ...
3    My Love is My OwnMy love is my own.\n\nIt is m...
4    The day my stalker became the love of my life....
5    Tinder Date to RelationshipWell..English is no...
6    Just a way of expressing what I feel since the...
7    He missed meEven tho he didn’t say so he initi...
8    emotional problems and left Index finger pain?...
9    guuyyyssss I’m confused she talked about going...
Name: data, dtype: object

In [18]:
asklove.head()

Unnamed: 0,data,subreddit_love
0,Missed opportunities...I want to take a moment...,1
1,You ever wish you can marry you can marry your...,1
2,You ever fall inlove with your own cousin and ...,1
3,My Love is My OwnMy love is my own.\n\nIt is m...,1
4,The day my stalker became the love of my life....,1


## Pre-Processing - Model - Evaluate 

In [19]:
#assigning out data the title and content to x, and the subreddit_love to y.
X= asklove['data']
y= asklove['subreddit_love']
#train test split our x and y, 25%, 75% randome state 42, and stratify to yes becasue its clasification problem 
# and ensure that the train and test sets have approximately the same percentage of samples of each 
#target class as the complete set.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

## Logistic Regression

In [20]:
#using pipleline our code looks clean and organized. 
pipe = Pipeline([
    ('cvec', CountVectorizer()), #instenciate count vectorizer  
    ('lr', LogisticRegression()) #and logistic regression
])

In [21]:
#giving the preameters for count vectorizer
pipe_params = {
    # adjusting features to reduce variance 
    'cvec__max_features': [2000,2500, 3000,3500],
    # trying stopwords and none
    'cvec__stop_words': [None, 'english'],
    #ignore terms that appear in more than 2, 3,4 documents. trying to find the best fit 
    'cvec__min_df': [2,3,4],
    #ignore terms that appear in more than 50% of the documents and so on. trying to find the best one
    'cvec__max_df': [.5,.9 ,.95],
    # ngram (1,1) looks only one word and 1,2 looks that word with a word before and after 
    #so we try both to get the best peramtert
    'cvec__ngram_range': [(1,1), (1,2)]
}
asklove = GridSearchCV(pipe, param_grid=pipe_params, cv=3)#using Grid search fit and score our model
asklove.fit(X_train, y_train)
print(f' CV Logistic Regression:  {asklove.best_score_}')
asklove.best_params_ #using best_prams getting the best premeters and score

 CV Logistic Regression:  0.9321897614580541


{'cvec__max_df': 0.5,
 'cvec__max_features': 3500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [22]:
#getting the accuracy on train and test data
print (f'CV Lr Train Score :{asklove.score(X_train, y_train)}') 
print (f'CV Lr Test Score  :{asklove.score(X_test, y_test)}')

CV Lr Train Score :0.9839185205038864
CV Lr Test Score  :0.9389067524115756


## Naive Bayes

In [23]:
#with pipeline count vectroize and naive bayes
pipe_nb = Pipeline([
    ('cvec', CountVectorizer()),#instenciate count vectorizer 
    ('nb', MultinomialNB()),#and logistic regression
])

In [24]:
#set the premeters
pipe_params_nb = {
    # adjusting features to reduce variance 
    'cvec__max_features': [2000,2500, 3000,3500, 4000,4500],
    # trying stopwords and none
    'cvec__stop_words': [None, 'english'],
    #ignore terms that appear in more than 2, 3,4 documents. trying to find the best fit 
    'cvec__min_df': [1,2,3,4],
    #ignore terms that appear in more than 20% of the documents and so on. trying to find the best one
    'cvec__max_df': [.2,.5,.9 ,.95],
    # ngram (1,1) looks only one word and 1,2 looks that word with a word before and after 
    #so we try both to get the best peramtert
    'cvec__ngram_range': [(1,1), (1,2)]
}
asklove = GridSearchCV(pipe_nb, param_grid=pipe_params, cv=3)#using Grid search fit and score our model
asklove.fit(X_train, y_train)
print(f'CV Naive Bayes: {asklove.best_score_}')
asklove.best_params_#using best_prams getting the best premeters and score

CV Naive Bayes: 0.9112838381131064


{'cvec__max_df': 0.5,
 'cvec__max_features': 3500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [25]:
#priinting the scores for test and train data
print (f'CV nb Train Score :{asklove.score(X_train, y_train)}') 
print (f'CV nb Test Score  :{asklove.score(X_test, y_test)}')

CV nb Train Score :0.9265612436344144
CV nb Test Score  :0.9244372990353698


In [26]:
##trying to print the confusion matrix 
# from sklearn.metrics import confusion_matrix 

In [27]:
asklove.best_estimator_

Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=3500, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## TFIDF- Lr

In [28]:
#instenciate TFIDF for tokanizing, trying diffrent way other steps are same as abovie 
tvec = TfidfVectorizer()

In [29]:
#using pipeline putting TFIDF and logistic regression in it for the next step 
pipe_tif_lr = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

In [30]:

pipe_params_tif_lr = {
    # trying stopwords and none
    'tvec__stop_words': [None, 'english'],
    # ngram (1,1) looks only one word and 1,2 looks that word with a word before and after 
    #so we try both to get the best peramtert
    'tvec__ngram_range': [(1,1), (1,2), (2,3)],
    #ignore terms that appear in more than 1, 2, 3,4 documents. trying to find the best fit 
    'tvec__min_df': [1,2,3,4],
    #ignore terms that appear in more than 20% of the documents and so on. trying to find the best one
    'tvec__max_df': [.2,.5,.9 ,.95],
    # adjusting features to reduce variance 
    'tvec__max_features': [2000,2500, 3000,3500, 4000,4500]
}
asklove = GridSearchCV(pipe_tif_lr, param_grid=pipe_params_tif_lr, cv=3)
asklove.fit(X_train, y_train)
print(asklove.best_score_)
asklove.best_params_

0.9289734655588314


{'tvec__max_df': 0.5,
 'tvec__max_features': 4000,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': None}

In [31]:
print (f'TIFDF Lr Train Score :{asklove.score(X_train, y_train)}') 
print (f'TIFDF Lr Test Score  :{asklove.score(X_test, y_test)}')

TIFDF Lr Train Score :0.960332350576253
TIFDF Lr Test Score  :0.9397106109324759


## TFIDF- nb

In [32]:
#expermentinig TFIDF and naive bayes
pipe_nb_tif = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),  
])

In [33]:
pipe_params_nb_tif = {
     # trying stopwords and none
    'tvec__stop_words': [None, 'english'],
    # ngram (1,1) looks only one word and 1,2 looks that word with a word before and after 
    #so we try both to get the best peramtert
    'tvec__ngram_range': [(1,1), (1,2), (2,3)],
    #ignore terms that appear in more than 1, 2, 3,4 documents. trying to find the best fit 
    'tvec__min_df': [1,2,3,4],
    #ignore terms that appear in more than 20% of the documents and so on. trying to find the best one
    'tvec__max_df': [.2,.5,.9 ,.95],
    # adjusting features to reduce variance 
    'tvec__max_features': [2000,2500, 3000,3500, 4000,4500]
    
    }
asklove = GridSearchCV(pipe_nb_tif, param_grid=pipe_params_nb_tif, cv=3)
asklove.fit(X_train, y_train)
print(asklove.best_score_)
asklove.best_params_

0.9203966764942375


{'tvec__max_df': 0.5,
 'tvec__max_features': 3500,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': None}

In [34]:
print (f'TIFDF nb Train Score :{asklove.score(X_train, y_train)}') 
print (f'TIFDF nb Test Score  :{asklove.score(X_test, y_test)}')

TIFDF nb Train Score :0.9407665505226481
TIFDF nb Test Score  :0.9236334405144695
