In [None]:
from nltk import tokenize
from operator import itemgetter
import math

### Get response from the webpage form

In [None]:
# Sample response text, hard-coded here in place of input from the webpage form.
response = 'I feel very happy when I am working on my favourite projects with a capable team'

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
# English stop words held in a set; later cells test words for membership in it.
stop_words = set(stopwords.words('english'))

In [None]:
# The keyword-extraction cells below read `doc`, `total_words` and
# `total_word_length`, none of which was assigned anywhere, so the notebook
# failed on a fresh kernel. Derive all of them from the user's response here.
doc = response

# Whitespace-separated words of the document and their count
# (the denominator of the term-frequency score).
total_words = doc.split()
total_word_length = len(total_words)

# Sentences of the document and their count
# (the numerator of the inverse-document-frequency score).
total_sentences = tokenize.sent_tokenize(doc)
total_sent_len = len(total_sentences)
print(total_sent_len)

In [None]:
# Term frequency: occurrences of each non-stop word (periods stripped),
# divided by the total number of words in the document.
tf_score = {}
for raw_word in total_words:
    term = raw_word.replace('.', '')
    if term in stop_words:
        continue
    tf_score[term] = tf_score.get(term, 0) + 1

# Normalise every raw count by the document's word count.
for term in tf_score:
    tf_score[term] = tf_score[term] / int(total_word_length)
print(tf_score)

In [None]:
def check_sent(word, sentences):
    """Count the sentences that contain ``word``.

    Args:
        word: A single term (``str``) or an iterable of terms. When several
            terms are given, a sentence counts only if it contains all of them.
        sentences: Iterable of sentence strings to search.

    Returns:
        int: Number of sentences in which every term occurs as a substring.
    """
    # A bare string is one term. Iterating over it directly would test its
    # individual characters, so "cat" would match any sentence containing the
    # letters c, a and t (e.g. "act now").
    terms = [word] if isinstance(word, str) else list(word)
    return sum(
        1 for sentence in sentences
        if all(term in sentence for term in terms)
    )

In [None]:
# Inverse document frequency:
#   log(number of sentences / number of sentences containing the word)
idf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word in stop_words or each_word in idf_score:
        continue
    # Count the matching sentences the first time a word is seen. The previous
    # version only did this for *repeated* words and hard-coded 1 for every
    # first occurrence, so most words never got a real count.
    # Fall back to 1 when nothing matches (e.g. stripping periods changed the
    # token) so the division below cannot hit zero.
    idf_score[each_word] = max(check_sent(each_word, total_sentences), 1)

# Performing a log and divide
idf_score = {
    word: math.log(int(total_sent_len) / sentence_count)
    for word, sentence_count in idf_score.items()
}

print(idf_score)

In [None]:
# TF-IDF is the product of the two scores; a word with no IDF entry scores 0.
tf_idf_score = {}
for term, term_frequency in tf_score.items():
    tf_idf_score[term] = term_frequency * idf_score.get(term, 0)
print(tf_idf_score)

In [None]:
def get_top_n(dict_elem, n):
    """Return the ``n`` highest-valued entries of ``dict_elem`` as a new dict.

    Args:
        dict_elem: Mapping whose values can be compared with each other.
        n: Number of entries to keep (applied as a slice bound).

    Returns:
        dict: Entries ordered from highest to lowest value.
    """
    ranked = list(dict_elem.items())
    # list.sort is stable, so entries with equal values keep their original
    # relative order even with reverse=True.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n])

### Get top n words

In [None]:
# Show the five words with the highest TF-IDF scores.
print(get_top_n(tf_idf_score, 5))

### Extract sentiment

In [None]:
# Build a word list from the State of the Union corpus: alphabetic tokens
# only, with English stop words removed (compared case-insensitively).
#
# The stop words are kept in a set for constant-time membership tests, and
# under their own name so that the `stopwords` corpus reader imported from
# nltk.corpus is not rebound to a plain list.
# The unused `w = nltk.corpus.shakespeare.words()` lookup was dropped.
english_stopwords = set(nltk.corpus.stopwords.words("english"))
words = [
    w for w in nltk.corpus.state_union.words()
    if w.isalpha() and w.lower() not in english_stopwords
]

In [None]:
from pprint import pprint

# Short sample passage used to demonstrate word tokenisation.
text = """
For some quick analysis, creating a corpus could be overkill.
If all you need is a word list,
there are simpler ways to achieve that goal."""

# Tokenise first, then pretty-print the tokens compactly within 79 columns.
tokens = nltk.word_tokenize(text)
pprint(tokens, width=79, compact=True)

### Train model on Twitter dataset

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from nltk.corpus import stopwords

In [None]:
# Allow up to 200 characters per column when displaying frames, so the
# tweet text is not truncated.

pd.set_option("display.max_colwidth", 200)

# Read the training and test sets from CSV files (paths are relative to the
# current working directory).

df=pd.read_csv('twitter_train.csv')
df_test=pd.read_csv('twitter_test.csv')

# Preview the first 5 rows of the training set.

df.head()

In [None]:
# Character length of every tweet in the training and test sets.
tweet_len = df['tweet'].str.len()
test_tweet_len = df_test['tweet'].str.len()

# Draw both length distributions on the same axes, 20 bins each.
for lengths, series_label in ((tweet_len, 'train'), (test_tweet_len, 'test')):
    plt.hist(lengths, bins=20, label=series_label)
plt.legend()
plt.show()

In [None]:
# Create the cleaned-text column `nice_tweet` by deleting every '@' together
# with the run of ASCII letters that follows it.
#
# This is done as one vectorised column operation. The previous row-by-row
# chained assignment (df['nice_tweet'][i] = ...) triggers
# SettingWithCopyWarning and may write to a temporary copy instead of the
# frame; it also seeded the column with the integer 0 before storing strings.
for frame in (df, df_test):
    frame['nice_tweet'] = frame['tweet'].str.replace(r'@[a-zA-Z]*', '', regex=True)

In [None]:
# Keep only ASCII letters, '#' and whitespace in the cleaned tweets.
#
# Applied as a vectorised column operation rather than chained per-row
# assignment (df['nice_tweet'][i] = ...), which may write to a temporary copy.
# The pattern is a raw string so that `\s` is not treated as an (invalid)
# string escape sequence; the regex itself is unchanged.
for frame in (df, df_test):
    frame['nice_tweet'] = frame['nice_tweet'].str.replace(r'[^a-zA-Z#\s]*', '', regex=True)

In [None]:
def _drop_short_words(text):
    """Keep only the whitespace-separated words longer than 3 characters."""
    kept = [token for token in text.split() if len(token) > 3]
    return ' '.join(kept)


# Remove short words from the cleaned tweets of both data sets.
df['nice_tweet'] = df['nice_tweet'].apply(_drop_short_words)
df_test['nice_tweet'] = df_test['nice_tweet'].apply(_drop_short_words)

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


# Split, lemmatize and re-join in a single pass per tweet. The previous
# version re-joined the token lists with chained per-row assignment
# (df['nice_tweet'][i] = ...), which may write to a temporary copy and leave
# the column holding lists instead of strings.
for frame in (df, df_test):
    frame['nice_tweet'] = frame['nice_tweet'].apply(_lemmatize_text)

In [None]:
stop_words = set(stopwords.words('english'))


def _without_stop_words(text):
    """Drop every whitespace-separated word of ``text`` found in ``stop_words``."""
    return ' '.join([token for token in text.split() if token not in stop_words])


# Remove English stop words from the cleaned tweets of both data sets.
df['nice_tweet'] = df['nice_tweet'].apply(_without_stop_words)
df_test['nice_tweet'] = df_test['nice_tweet'].apply(_without_stop_words)

In [None]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
count_vectorizer = CountVectorizer(max_df = 0.9 , min_df = 2 , stop_words = 'english' , max_features = 1000)
dff = count_vectorizer.fit_transform(df['nice_tweet'])

tfidf_vectorizer = TfidfVectorizer(max_df = 0.9 , min_df = 2 , stop_words = 'english' , max_features = 1000)
dfff = tfidf_vectorizer.fit_transform(df['nice_tweet'])

In [None]:
# The first 20000 rows are used for training; the remaining rows are held out.
n_train = 20000

train_x, test_x = dff[:n_train, :], dff[n_train:, :]
train_y, test_y = df['label'][:n_train], df['label'][n_train:]

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the bag-of-words training features
# and predict labels for the held-out rows.
lr = LogisticRegression()
lr.fit(train_x , train_y)
pred = lr.predict(test_x)

from sklearn import metrics

# scikit-learn metrics take (y_true, y_pred). With the arguments in that
# order, rows are the true labels and columns the predicted labels; the
# previous (pred, test_y) order produced the transposed matrix.
metrics.confusion_matrix(test_y , pred)

In [None]:
# Fraction of held-out tweets classified correctly. Arguments follow the
# scikit-learn (y_true, y_pred) convention; the value itself does not depend
# on the order.
metrics.accuracy_score(test_y , pred)

In [None]:
# Per-class precision / recall / F1 on the held-out rows. The true labels must
# come first: with the arguments swapped, precision and recall are exchanged.
print(metrics.classification_report(test_y , pred))

In [None]:
from sklearn.preprocessing import binarize

# Predicted probability of the second class in lr.classes_ for each held-out
# row (presumably label 1 — confirm against the data set's label encoding).
y_prob=lr.predict_proba(test_x)[:,1]

# Re-classify with a 0.15 cut-off instead of the default 0.5: values strictly
# greater than the threshold become 1, the rest 0. `threshold` is passed by
# keyword because current scikit-learn releases only accept it that way.
pred1=binarize([y_prob], threshold=0.15)[0]

# (y_true, y_pred) order: rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(test_y, pred1))

In [None]:
# Per-class report for the thresholded predictions. The true labels must come
# first: with the arguments swapped, precision and recall are exchanged.
print(metrics.classification_report(test_y , pred1))

### Use trained model on user's response

In [None]:
# The classifier was fitted on CountVectorizer features, so the raw response
# text must be converted with the same fitted vectorizer before prediction;
# passing the string itself to predict_proba fails.
# NOTE(review): the response does not go through the tweet-cleaning steps
# (mention removal, short-word filter, lemmatisation) — confirm whether it should.
response_features = count_vectorizer.transform([response])

# predict_proba(...)[0] holds one probability per class, in lr.classes_ order.
# Centre on 0.5 and scale by 100, giving values between -50 and 50.
satisfaction_quotient = (lr.predict_proba(response_features)[0] - 0.5) * 100
print(satisfaction_quotient)