In [2]:
import pandas as pd
import numpy as np

In [3]:
train_data=pd.read_csv(r"C:\Users\light\Desktop\train.csv", encoding='ISO-8859-1')

In [4]:
import nltk
from nltk.tokenize import word_tokenize

def most_used_words(train_data):
    tokens = word_tokenize(train_data)
    frequency_dist = nltk.FreqDist(tokens)
    print("There is %d different words" % len(set(tokens)))
    return sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)

In [5]:
from nltk.corpus import stopwords

mw = most_used_words(train_data.SentimentText.str.cat())
most_words = []
for w in mw:
    if len(most_words) == 1000:
        break
    if w in stopwords.words("english"):
        continue
    else:
        most_words.append(w)

There is 133899 different words


In [10]:

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

def stem_tokenize(train_data):
    stemmer = SnowballStemmer("english")
    stemmer = WordNetLemmatizer()
    return [stemmer.lemmatize(token) for token in word_tokenize(train_data)]

def lemmatize_tokenize(train_data):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(train_data)]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
train_data.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

In [13]:
from sklearn.model_selection import train_test_split

sentiments = train_data['Sentiment']
tweets = train_data['SentimentText']

vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenize, ngram_range=(2,2))
learn_data, test_data, sentiments_learning, sentiments_test = train_test_split(tweets, sentiments, test_size=0.3)
learning_data = vectorizer.fit_transform(learn_data)

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

lr = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()

models = {
    'logitic regression': lr,
    'bernoulliNB': bnb,
    'multinomialNB': mnb,
}
for model in models.keys():
    scores = cross_val_score(models[model], learning_data, sentiments_learning, scoring="f1", cv=10)
    print("===", model, "===")
    print("scores = ", scores)
    models[model].fit(learning_data, sentiments_learning)
    print("score on the learning data (accuracy) = ", accuracy_score(models[model].predict(learning_data), sentiments_learning))
    print("")



=== logitic regression ===
scores =  [0.78482446 0.78675799 0.7940707  0.78263834 0.78083751 0.78616924
 0.78679224 0.78261859 0.79032626 0.78760359]
score on the learning data (accuracy) =  0.889444507943765

=== bernoulliNB ===
scores =  [0.769349   0.76956904 0.76953686 0.77186774 0.76544028 0.77011115
 0.76733707 0.76466711 0.77263714 0.76469937]
score on the learning data (accuracy) =  0.913447251114413

=== multinomialNB ===
scores =  [0.79716184 0.79781793 0.7943553  0.79375141 0.79392083 0.7977908
 0.79384128 0.78976565 0.80104831 0.79222551]
score on the learning data (accuracy) =  0.9563664418790719



In [15]:
mnb.fit(learning_data, sentiments_learning)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
testing_data = vectorizer.transform(test_data)
mnb.score(testing_data, sentiments_test)

0.7404740474047404

In [17]:
model = MultinomialNB()
model.fit(learning_data, sentiments_learning)
tweet = pd.Series([input(),])
tweet = vectorizer.transform(tweet)
proba = model.predict_proba(tweet)[0]
print("The probability that this tweet is sad is:", proba[0])
print("The probability that this tweet is happy is:", proba[1])

hey let's go for fun
The probability that this tweet is sad is: 0.38213385785027726
The probability that this tweet is happy is: 0.617866142149722
