# Problem Statement

1. The json file contains the **stocktwit data**, the **timestamp** of collecting the tweet and the ticker (stock identifier).
2. The tagged **sentiment ranges from 0-3**.

Build a model to predict the sentiment of the stocktwit
* The json files have a structure as follows:

* { ‘records’: [

* {

* 'stocktwit_tweet': ‘$TSLA is a definite buy today’,

* 'sentiment_score': ‘3’,

* 'timestamp': ‘2018-07-01 00:00:09+00:00’,

* 'ticker': ‘TSLA’

* },

* {..},

* {..}

* ]

* }

In [None]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

In [None]:
#Do not display warnings in notebook 
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)
pd.options.display.max_seq_items = 2000

import sys

np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Importing the json file
J_tweets = pd.read_json ("../input/financial-data/train_data_JSON.json")
# df = pd.read_json(filename)
# df.head()

In [None]:
# Importing the json file
JSON_tweets_test = pd.read_json("../input/financial-data/test_data.json")

In [None]:
# Print the shape of the tweeets data
J_tweets.records.shape

In [None]:
JSON_tweets_test.records.shape

In [None]:
# Print the first json record
J_tweets.records[0]

In [None]:
JSON_tweets_test.records[0]

In [None]:
# Convert the json data into dataframe.
# `pandas.io.json.json_normalize` was deprecated in pandas 1.0 in favour of the
# top-level `pandas.json_normalize`. Try the current location first and fall
# back to the legacy one so the notebook imports on both old and new pandas.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

In [None]:
tweets_data = json_normalize(J_tweets.records)

In [None]:
tweets_data1 = json_normalize(JSON_tweets_test.records)

In [None]:
# Save the dataframe into a csv file
tweets_data.to_csv("tweets.csv",index=False)
tweets_data1.to_csv("tweets1.csv",index=False)

## Exploring the data

In [None]:
# Import the csv files written by the previous cell: train -> tweets, test -> tweets1
tweets = pd.read_csv("tweets.csv")
# Fixed: the test frame must come from "tweets1.csv"; reading "tweets.csv" here
# made the "test" data a copy of the training data.
tweets1 = pd.read_csv("tweets1.csv")

In [None]:
tweets.skew(), tweets.kurt()

In [None]:
tweets.sentiment_score.value_counts()

In [None]:
# Bar chart of how many training tweets carry each sentiment score.
Sentiment_count = tweets['sentiment_score'].value_counts()
plt.figure(figsize=(10,4))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=Sentiment_count.index, y=Sentiment_count.values, alpha=0.8)
plt.ylabel("COUNT")
plt.xlabel("sentiment_score")
plt.title('sentiment_score counts across the text data', loc='Center', fontsize=19)
plt.show()

In [None]:
tweets.head(5)

In [None]:
tweets1.head(5)

In [None]:
#Number of words in train data
tweets['word_count'] = tweets['stocktwit_tweet'].apply(lambda x: len(str(x).split(" ")))
tweets[['stocktwit_tweet','word_count']].head()

In [None]:
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
#Number of words in testdata
tweets1['word_count'] = tweets1['stocktwit_tweet'].apply(lambda x: len(str(x).split(" ")))
tweets1[['stocktwit_tweet','word_count']].head()

In [None]:
tweets['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='Review Text Word Count Distribution')
plt.show()

In [None]:
# Avg word length
def avg_word(sentence):
    """Return the mean character length of the whitespace-separated words.

    Args:
        sentence: Text to measure.

    Returns:
        The average word length as a float, or 0.0 when the text contains no
        words (empty or whitespace-only), which previously raised
        ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

tweets['avg_word'] = tweets['stocktwit_tweet'].apply(lambda x: avg_word(x))
tweets[['stocktwit_tweet','avg_word']].head()

In [None]:
# Histogram of the average word length per training tweet.
# Fixed labels: the axis/title used to say "word count", copied from the
# word-count histogram, although this column holds average word length.
tweets['avg_word'].iplot(
    kind='hist',
    bins=100,
    xTitle='average word length',
    linecolor='black',
    yTitle='count',
    title='Average Word Length Distribution')
plt.show()

In [None]:

# Avg word length in test data
def avg_word(sentence):
    """Return the mean character length of the whitespace-separated words.

    Args:
        sentence: Text to measure.

    Returns:
        The average word length as a float, or 0.0 when the text contains no
        words (empty or whitespace-only), which previously raised
        ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

tweets1['avg_word'] = tweets1['stocktwit_tweet'].apply(lambda x: avg_word(x))
tweets1[['stocktwit_tweet','avg_word']].head()

In [None]:
# Histogram of the average word length per test tweet.
# Fixed labels: the axis/title used to say "word count", copied from the
# word-count histogram, although this column holds average word length.
tweets1['avg_word'].iplot(
    kind='hist',
    bins=100,
    xTitle='average word length',
    linecolor='black',
    yTitle='count',
    title='Average Word Length Distribution')
plt.show()

In [None]:
# Number of hashtag tokens in train data: count the whitespace-separated
# tokens that begin with '#'. (The column name 'hastags' is kept unchanged.)
def _count_hashtags(text):
    return sum(1 for token in text.split() if token.startswith('#'))

tweets['hastags'] = tweets['stocktwit_tweet'].apply(_count_hashtags)
tweets[['stocktwit_tweet','hastags']].head()

In [None]:
# Number of hashtag tokens in test data: count the whitespace-separated
# tokens that begin with '#'. (The column name 'hastags' is kept unchanged.)
tweets1['hastags'] = tweets1['stocktwit_tweet'].apply(
    lambda text: len([token for token in text.split() if token[:1] == '#']))
tweets1[['stocktwit_tweet','hastags']].head()

In [None]:
# Number of numerics in train data
tweets['numerics'] = tweets['stocktwit_tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
tweets[['stocktwit_tweet','numerics']].head()

In [None]:
# Number of numerics in test data
tweets1['numerics'] = tweets1['stocktwit_tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
tweets1[['stocktwit_tweet','numerics']].head()

In [None]:
# Number of uppercase words in train data
tweets['upper'] = tweets['stocktwit_tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
tweets[['stocktwit_tweet','upper']].head()

In [None]:
# Number of uppercase words in test data
tweets1['upper'] = tweets1['stocktwit_tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
tweets1[['stocktwit_tweet','upper']].head()

# Timestamp

In [None]:
# Conversion in train_data: parse the timestamp column, then split it into a
# calendar-date column and a time-of-day column.
train_stamps = pd.to_datetime(tweets['timestamp'])
tweets['Dates'] = train_stamps.dt.date
tweets['Time'] = train_stamps.dt.time

In [None]:
# Conversion in test_data: parse the timestamp column, then split it into a
# calendar-date column and a time-of-day column.
test_stamps = pd.to_datetime(tweets1['timestamp'])
tweets1['Dates'], tweets1['Time'] = test_stamps.dt.date, test_stamps.dt.time

In [None]:
# Calendar features derived from the tweet date (train_data).
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}

tweets["Dateoftweet"]=pd.to_datetime(tweets["Dates"])
tweets["Day"]=tweets["Dateoftweet"].dt.day
tweets["month"]=tweets["Dateoftweet"].dt.month
# Fixed: "year" used to be filled from .dt.month, duplicating the month column.
tweets["year"]=tweets["Dateoftweet"].dt.year
tweets["dayOftheweek"]=tweets["Dateoftweet"].dt.dayofweek

# Replace the 0-6 weekday number with its short name via the `days` mapping.
tweets['dayOftheweek'] = tweets['dayOftheweek'].apply(lambda x: days[x])

In [None]:
tweets.drop(['timestamp'],axis=1,inplace=True)

In [None]:
tweets.head()

In [None]:
tweets.dtypes

In [None]:
# Bar chart of the number of training tweets per day of the month.
Tweet_Day_Count = tweets['Day'].value_counts()
plt.figure(figsize=(10,4))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=Tweet_Day_Count.index, y=Tweet_Day_Count.values, alpha=0.8)
plt.ylabel("Number Of Tweet")
plt.xlabel("Tweets By Days")
plt.title('Total tweets count by Day', loc='Center', fontsize=14)
plt.show()

# Stopwords

In [None]:
# Number of stop words in train_data
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership tests against a set are O(1) per token; `stop` itself is left as
# the original list so other cells that read it see the same object type.
stop_lookup = set(stop)

# NOTE(review): the tweets have not been lower-cased yet at this point, so
# capitalised tokens such as "The" are not counted - confirm that is intended.
tweets['stopwords'] = tweets['stocktwit_tweet'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_lookup))
tweets[['stocktwit_tweet','stopwords']].head()

In [None]:
# Number of characters
tweets1['char_count'] = tweets1['stocktwit_tweet'].str.len() ## this also includes spaces
tweets1[['stocktwit_tweet','char_count']].head()

In [None]:
# Print the shape of the dataframe(train_data)
tweets.shape

In [None]:
# Print the shape of the dataframe(test_data)
tweets1.shape

In [None]:
tweets.head(5)

In [None]:
# Bar chart of how many training tweets carry each sentiment score.
Sentiment_count = tweets['sentiment_score'].value_counts()
plt.figure(figsize=(10,4))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=Sentiment_count.index, y=Sentiment_count.values, alpha=0.8)
plt.ylabel("Count")
plt.xlabel("sentiment Count")
plt.title('sentiment counts across the tweet data', loc='Center', fontsize=14)
plt.show()

In [None]:
tweets1.head(5)

In [None]:
## Checking the null values in the data

In [None]:
tweets.info()

In [None]:
tweets1.info()

In [None]:
### find the sentiment score for train_data
tweets.sentiment_score.value_counts()

# Ticker

In [None]:
# Preprocess the ticker feature (train_data): lower-case each symbol, drop any
# '$' it contains, then prefix a single '$'. The last line shows how many
# distinct tickers remain.
tweets['ticker'] = tweets['ticker'].apply(
    lambda symbol: '$' + symbol.lower().replace('$', ''))
len(tweets['ticker'].str.lower().unique())

In [None]:
# Preprocess the ticker feature (test_data): lower-case each symbol, drop any
# '$' it contains, then prefix a single '$'. The last line shows how many
# distinct tickers remain.
def _with_dollar_prefix(symbol):
    return '$' + symbol.lower().replace('$', '')

tweets1['ticker'] = tweets1['ticker'].apply(_with_dollar_prefix)
len(tweets1['ticker'].str.lower().unique())

In [None]:
top_ticker = tweets.ticker.value_counts()[:10]
top_ticker

In [None]:
# Bar chart of the ticker counts held in `top_ticker`.
plt.figure(figsize=(15,12))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=top_ticker.index, y=top_ticker.values, alpha=0.8)
plt.ylabel("Count")
plt.xlabel("Frequent words")
plt.title('frequency of Ticker', loc='Center', fontsize=14)
plt.show()

In [None]:
# Plot the graph between the sentiment_score and the count(train_data)
Sentiment_count = tweets['sentiment_score'].value_counts()
plt.figure(figsize=(12,8))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=Sentiment_count.index, y=Sentiment_count.values, alpha=0.8)
plt.ylabel("COUNT")
plt.xlabel("sentiment_score")
plt.title('sentiment_score counts across the text data', loc='Center', fontsize=19)
plt.show()

In [None]:
# Import the libraries
import nltk
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
tweets.head(5)

In [None]:
tweets1.head(5)

## Remove non letters

In [None]:
# Removing non_letters(train_data)
tweets["stocktwit_tweet"]=tweets["stocktwit_tweet"].apply(lambda x:re.sub("[^A-Za-z]", " ", x.strip()))

In [None]:
# Removing non_letters-(test_data) 
tweets1["stocktwit_tweet"]=tweets1["stocktwit_tweet"].apply(lambda x:re.sub("[^A-Za-z]", " ", x.strip()))

# Convert into Lower Case

In [None]:
# Converting into Lower Cases- (train_data)
tweets['stocktwit_tweet'] = tweets['stocktwit_tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [None]:
# Converting into Lower Cases- (test_data)
tweets1['stocktwit_tweet'] = tweets1['stocktwit_tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## Removing Numbers for dataset

In [None]:
# Removing Numbers-train_data
# regex=True is stated explicitly: from pandas 2.0 str.replace treats the
# pattern as a literal string by default, so the digit class would never match.
# The raw string also avoids the invalid-escape warning for '\d'.
tweets['stocktwit_tweet'] = tweets['stocktwit_tweet'].str.replace(r'[\d]', '', regex=True)

In [None]:
# Removing Numbers- (test_data)
# regex=True is stated explicitly: from pandas 2.0 str.replace treats the
# pattern as a literal string by default, so the digit class would never match.
# The raw string also avoids the invalid-escape warning for '\d'.
tweets1['stocktwit_tweet'] = tweets1['stocktwit_tweet'].str.replace(r'[\d]', '', regex=True)

## Removing punctuation marks

In [None]:
# Removing punctuation marks (train_data): drop every character that is
# neither a word character nor whitespace.
# regex=True is stated explicitly: from pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave punctuation in place.
tweets['stocktwit_tweet'] = tweets['stocktwit_tweet'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Removing punctuation marks (test_data): drop every character that is
# neither a word character nor whitespace.
# regex=True is stated explicitly: from pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave punctuation in place.
tweets1['stocktwit_tweet'] = tweets1['stocktwit_tweet'].str.replace(r'[^\w\s]', '', regex=True)

## Removing Stop words

In [None]:
# Removing Stop words- (train_data)
stop = stopwords.words('english')
# Look tokens up in a set (O(1) each) instead of scanning the list per token.
stop_lookup = set(stop)
tweets['stocktwit_tweet'] = tweets['stocktwit_tweet'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_lookup))

In [None]:
# Common word removal (train_data): find the 10 most frequent words.
freq = pd.Series(' '.join(tweets['stocktwit_tweet']).split()).value_counts()[:10]
print(freq)

# Remove these words, as their presence will not be of any use in the
# classification of our text data.
# Fixed: `freq = list()` replaced the counts with an empty list, so the filter
# below never removed anything; use the words (the index of the counts).
freq = list(freq.index)
tweets['stocktwit_tweet'] = tweets['stocktwit_tweet'].apply(lambda x: " ".join(word for word in x.split() if word not in freq))
tweets['stocktwit_tweet'].head()

In [None]:
# Common word removal(test_data)
freq = pd.Series(' '.join(tweets1['stocktwit_tweet']).split()).value_counts()[:10]
print(freq)

In [None]:
# Remove these words, as their presence will not be of any use in the
# classification of our text data.
# Fixed: `freq = list()` replaced the counts computed in the previous cell with
# an empty list, so the filter below never removed anything.
# NOTE(review): this removes the test set's own top-10 words; confirm whether
# the training set's list should be applied to the test data instead.
freq = list(freq.index)
tweets1['stocktwit_tweet'] = tweets1['stocktwit_tweet'].apply(lambda x: " ".join(word for word in x.split() if word not in freq))
tweets1['stocktwit_tweet'].head()

In [None]:
# Rare words removal (train_data)
freq = pd.Series(' '.join(tweets['stocktwit_tweet']).split()).value_counts()[-10:]
print(freq)

freq = list(freq.index)
tweets['stocktwit_tweet'] = tweets['stocktwit_tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
tweets['stocktwit_tweet'].head()

In [None]:
# Rare words removal (test_data)
freq = pd.Series(' '.join(tweets1['stocktwit_tweet']).split()).value_counts()[-10:]
print(freq)

freq = list(freq.index)
tweets1['stocktwit_tweet'] = tweets1['stocktwit_tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
tweets1['stocktwit_tweet'].head()

## Word Frequency

In [None]:
# Word Frequency- train_data
Word_freq = pd.Series(' '.join(tweets['stocktwit_tweet']).split()).value_counts()[:10]
Word_freq

In [None]:
# Word Frequency- test_data
Word_freq = pd.Series(' '.join(tweets1['stocktwit_tweet']).split()).value_counts()[:10]
Word_freq

In [None]:
# Word Frequency Plot - (train_data)
# Recompute the training counts here: the preceding cell reassigns `Word_freq`
# to the test-data counts, so plotting `Word_freq` showed the test set instead.
train_word_freq = pd.Series(' '.join(tweets['stocktwit_tweet']).split()).value_counts()[:10]
plt.figure(figsize=(10,4))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=train_word_freq.index, y=train_word_freq.values, alpha=0.8)
plt.ylabel("Count")
plt.xlabel("Frequent words")
plt.title('Frequent words in review_data', loc='Center', fontsize=14)
plt.show()

In [None]:
# Word Frequency Plot - (test_data)
plt.figure(figsize=(10,4))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=Word_freq.index, y=Word_freq.values, alpha=0.8)
plt.ylabel("Count")
plt.xlabel("Frequent words")
plt.title('Frequent words in review_data', loc='Center', fontsize=14)
plt.show()

In [None]:
# Stemming (train_data): replace every whitespace-separated token with its
# Porter stem and re-join the tokens with single spaces.
st = PorterStemmer()

def _stem_tweet(text):
    return " ".join(map(st.stem, text.split()))

tweets["stocktwit_tweet"] = tweets["stocktwit_tweet"].apply(_stem_tweet)

In [None]:
# Stemming (test_data): replace every whitespace-separated token with its
# Porter stem and re-join the tokens with single spaces.
st = PorterStemmer()
tweets1["stocktwit_tweet"] = tweets1["stocktwit_tweet"].apply(
    lambda text: " ".join(st.stem(token) for token in text.split()))

In [None]:
 nltk.download('wordnet')

## Lemmatization

In [None]:
# # Lemmatization (train_data)
Lem = WordNetLemmatizer()
tweets["stocktwit_tweet"] = tweets["stocktwit_tweet"].apply(lambda x: " ".join([Lem.lemmatize(word)
                                                          for word in x.split()]))

In [None]:
# # Lemmatization (test_data)
Lem = WordNetLemmatizer()
tweets1["stocktwit_tweet"] = tweets1["stocktwit_tweet"].apply(lambda x: " ".join([Lem.lemmatize(word)
                                                           for word in x.split()]))

In [None]:
tweets.head(5)

In [None]:
tweets1.head(5)

In [None]:
# Splitting into train and test: hold out 25% of the labelled tweets, with a
# fixed random_state so the split is the same on every run.
split_options = {'test_size': 0.25, 'random_state': 7}
X_train, X_test, Y_train, Y_test = train_test_split(
    tweets['stocktwit_tweet'], tweets['sentiment_score'], **split_options)

In [None]:

# Print the shape of train and test
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

In [None]:
# Function for cleaning the text
def clean_text(text):
    """Normalise one tweet into space-separated, lower-case alphabetic tokens.

    Steps, in order: lower-case; drop @mentions; drop http(s) URLs; drop
    "www."-prefixed hosts; drop alphanumeric runs containing "www" followed by
    "com"; turn every non-letter into a space; keep only tokens longer than
    two characters.

    Args:
        text: Raw tweet text (a str).

    Returns:
        The cleaned tweet as a single string; "" when no token survives.
    """
    text = text.lower()
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # \S+ consumes the whole URL. The previous character class stopped at the
    # first '-', '?', '=', ... and left the URL tail behind as junk tokens.
    text = re.sub(r'https?://\S+', '', text)
    # Escaped dot: only a literal "www." starts a host name. Unescaped, the
    # pattern also swallowed ordinary words that merely begin with "www".
    text = re.sub(r'www\.[^ ]+', '', text)
    text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return ' '.join(token for token in text.split() if len(token) > 2)

X_train = X_train.apply(clean_text)
X_test =X_test.apply(clean_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, 
                             min_df=3, max_features=None, binary=False, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [None]:
# Fit the model
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [None]:
# Print the shape of the train and test
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

In [None]:
# Build a naive_bayes model
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB().fit(X_train_tfidf, Y_train)
pred_test = nb_clf.predict(X_test_tfidf)

# Print the mertics
print('f1_score       :', f1_score(Y_test, pred_test, average='macro'))
print('accuracy score :', accuracy_score(Y_test, pred_test))

In [None]:
# Print the classification reports
print(classification_report(Y_test, pred_test))

In [None]:
# Import the libraries
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

# Build the model: one-vs-rest logistic regression trained with SGD.
# The logistic loss is spelled 'log_loss' from scikit-learn 1.1 on, and the old
# 'log' spelling was removed in 1.3, so pick the name the installed version accepts.
import sklearn
_sk_major, _sk_minor = (int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

sgd = SGDClassifier(loss=logistic_loss, max_iter=200, random_state=0, class_weight='balanced')
ovr = OneVsRestClassifier(sgd)
ovr.fit(X_train_tfidf, Y_train)
y_pred_class = ovr.predict(X_test_tfidf)

# Print the metrics score on the held-out split
print('f1_score       :', f1_score(Y_test, y_pred_class, average='macro'))
print('accuracy score :', accuracy_score(Y_test, y_pred_class))

In [None]:
# Print the classification report
print(classification_report(Y_test, y_pred_class)) 

In [None]:
# Print the confusion matrix
confusion_matrix(Y_test,y_pred_class)

In [None]:
# Validation DataFrame
validation = pd.DataFrame({'stocktwit_tweet':X_test,
                           'predicted_sentiment':''})

In [None]:
validation['predicted_sentiment']=y_pred_class
validation['actual_sentiment']=Y_test

## Comparison of the predicted and actual sentiment of test data.

In [None]:
validation.head(10)

In [None]:
validation.shape

In [None]:
validation['predicted_sentiment'].value_counts()

In [None]:

validation['actual_sentiment'].value_counts()

In [None]:
tweets1.head()

In [None]:
# Preprocessing the test text: clean each tweet, vectorise it with the
# already-fitted TF-IDF vectoriser, then label it with the one-vs-rest model.
X_test_data = tweets1['stocktwit_tweet'].apply(clean_text)
X_test_data_tfidf = tfidf_vect.transform(X_test_data)
print(X_test_data_tfidf.shape)

# Store the predictions both in their own variable and on the test frame.
tweets1['sentiment_score'] = y_pred_class_data = ovr.predict(X_test_data_tfidf)

In [None]:
tweets1.head()