# Reddit posts sentiment analysis

In [None]:

import os
# Switch the working directory to the project folder so that relative paths
# used afterwards resolve there.
# NOTE(review): this is a machine-specific absolute Windows path; os.chdir
# raises if the folder does not exist on the machine running the notebook.
dir_Path = 'D:\\College\\NLP\\SentimentAnalysis'
os.chdir(dir_Path)

## Data Facts and Import 

In [None]:
import pandas as pd 
# Load the posts from RedditData.csv; the relative path is resolved against
# the current working directory.
Postdata = pd.read_csv('RedditData.csv')

In [None]:
# (number of rows, number of columns) of the loaded table.
Postdata.shape

In [None]:
# Preview the first rows (pandas shows five by default).
Postdata.head()

In [None]:
# Print column names, dtypes and non-null counts.
Postdata.info()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
Postdata.describe()

## Data Cleaning / EDA

In [None]:
### Missing-value audit of the loaded csv data

# Null cells per column; computed once and reused for both measures below.
null_totals = Postdata.isnull().sum()

count = null_totals.sort_values(ascending=False)
percentage = (null_totals / len(Postdata) * 100).sort_values(ascending=False)

# Absolute count and share of rows side by side, one row per dataset column.
missing_data = pd.concat(
    [count, percentage],
    axis=1,
    keys=['Count', 'Percentage'],
)

print('Count and percentage of missing values for the columns:')

# Last expression of the cell, so the notebook renders the table.
missing_data

In [None]:
### Checking for the Distribution of Category ###
import matplotlib.pyplot as plt
%matplotlib inline
print('Percentage for category\n')
print(round(Postdata.category.value_counts(normalize=True)*100,2))
round(Postdata.category.value_counts(normalize=True)*100,2).plot(kind='bar')
plt.title('Percentage Distributions by the category type')
plt.show()

In [None]:
# Preview the first ten rows.
Postdata.head(10)

In [None]:
# First level cleaning
import re
import string

# Patterns are written as raw strings and compiled once, so the regex escapes
# (\[ \w \d) are never misread as invalid Python string escapes.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_clean_1(text):
    """Run the first cleaning pass over a single post.

    Steps, in order:
      1. lower-case the text;
      2. drop every ``[...]`` span, brackets and contents included;
      3. delete all ASCII punctuation (``string.punctuation``);
      4. delete every word that contains at least one digit.

    Args:
        text: The raw post. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing cell, or ``None``) is treated
            as an empty post instead of raising ``AttributeError``.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed tokens
        can leave double or trailing spaces behind.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    # Brackets must already be handled: this step would strip a lone "[" or "]".
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_TOKEN_RE.sub('', text)
    return text


# Kept under its original name because ``cleaned1`` is what gets passed to
# ``Series.apply``; a plain alias replaces the redundant lambda wrapper.
cleaned1 = text_clean_1

In [None]:
# Run the first cleaning pass over every post and keep the result in a new column.
Postdata['cleaned_post_data'] = Postdata['redditpost'].apply(cleaned1)
Postdata.head(10)

In [None]:
# Second round of cleaning.
# Typographic marks removed by this pass: curly single quotes, curly double
# quotes and the horizontal ellipsis.
_TYPOGRAPHIC_TABLE = str.maketrans('', '', '\u2018\u2019\u201c\u201d\u2026')
_LINE_BREAKS_RE = re.compile(r'[\r\n]+')


def text_clean_2(text):
    """Run the second cleaning pass over a single post.

    Removes curly quotes and the ellipsis character, then turns every run of
    line breaks into one space. A space is used rather than deleting the
    break outright, because deletion would glue the last word of one line to
    the first word of the next ("one\\ntwo" -> "onetwo").

    Args:
        text: Output of the first cleaning pass. Anything that is not a
            ``str`` is treated as an empty post.

    Returns:
        The cleaned, single-line string.
    """
    if not isinstance(text, str):
        return ''

    text = text.translate(_TYPOGRAPHIC_TABLE)
    return _LINE_BREAKS_RE.sub(' ', text)


# Kept under its original name because ``cleaned2`` is what gets passed to
# ``Series.apply``; a plain alias replaces the redundant lambda wrapper.
cleaned2 = text_clean_2

In [None]:
# Run the second cleaning pass over the first-pass output; the final text goes in 'cleaned_data'.
Postdata['cleaned_data'] = Postdata['cleaned_post_data'].apply(cleaned2)
Postdata.head(10)

## Model training 

In [None]:
from sklearn.model_selection import train_test_split

# Feature: the fully cleaned post text. Target: the category label.
Independent_var = Postdata['cleaned_data']
Dependent_var = Postdata['category']

# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
IV_train, IV_test, DV_train, DV_test = train_test_split(
    Independent_var,
    Dependent_var,
    test_size=0.1,
    random_state=225,
)

# Report how many rows landed in each partition.
for label, subset in (
    ('IV_train :', IV_train),
    ('IV_test  :', IV_test),
    ('DV_train :', DV_train),
    ('DV_test  :', DV_test),
):
    print(label, len(subset))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Building blocks of the model: TF-IDF text features and a
# logistic-regression classifier using the L-BFGS solver.
tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver="lbfgs")

In [None]:
# Chain vectorisation and classification so raw text can be passed straight in.
model = Pipeline(steps=[
    ('vectorizer', tvec),
    ('classifier', clf2),
])

# Fit on the training split, then label the held-out posts.
predictions = model.fit(IV_train, DV_train).predict(IV_test)

## Model prediction

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but precision and recall are not: with the arguments swapped,
# the per-class values trade places and the 'weighted' average is weighted by
# predicted rather than true class counts. Ground truth therefore goes first.
print("Accuracy : ", accuracy_score(DV_test, predictions))
print("Precision : ", precision_score(DV_test, predictions, average='weighted'))
print("Recall : ", recall_score(DV_test, predictions, average='weighted'))

## Testing the model

In [None]:
# Label encoding of the predicted category:
#  -1 is negative sentiment
#   0 is neutral sentiment
#   1 is positive sentiment
example = ["I am happy"]
result = model.predict(example)

print(result)

# ``result`` holds one label per text in ``example``. Each label is handled
# on its own: testing the whole array against a list inside ``if`` raises
# ValueError as soon as ``example`` contains more than one text.
sentiment_messages = {
    1: "The sentiment is positive",
    0: "The sentiment is neutral",
}
for label in result:
    # Any label other than 1 or 0 is reported as negative, as before.
    print(sentiment_messages.get(label, "The sentiment is negative"))