In [1]:
import os
import sys

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import pandas as pd
import numpy as np

In [2]:
sys.path += ['/content/drive/Shareddrive/filtered_data']

In [3]:
import os
import pickle
import stat
import time

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
%cd /content/drive/Shareddrives/cs145project/filtered_data/stocks
%cd ../..
%cd headlines/sentiment_analysis_financial_news

/content/drive/Shareddrives/cs145project/filtered_data/stocks
/content/drive/Shareddrives/cs145project
/content/drive/Shareddrives/cs145project/headlines/sentiment_analysis_financial_news


In [9]:
# Getting and processing the data 
path="all-data.csv"
sentiments=pd.read_csv(path,encoding = "ISO-8859-1")
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
sentiments[sentiments.columns[1]] = sentiments[sentiments.columns[1]].apply(clean_text)
sentiments.head()

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,technopolis plans develop stages area less 100...
1,negative,international electronic industry company elco...
2,positive,new production plant company would increase ca...
3,positive,according company 's updated strategy years 20...
4,positive,financing aspocomp 's growth aspocomp aggressi...


In [10]:
#Splitting into training and testing sets
X=sentiments[sentiments.columns[1]]
y=sentiments[sentiments.columns[0]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
# Model building
model = make_pipeline(TfidfVectorizer(), SGDClassifier())
# Training the model with the training data
model.fit(X_train, y_train)

# Training accuracy
predicted_train = model.predict(X_train)
print(f"train accuracy: {accuracy_score(y_train, predicted_train)}")

# Predicting the test data categories
predicted_categories = model.predict(X_test)
print(f"test accuracy: {accuracy_score(y_test, predicted_categories)}")

train accuracy: 0.9780701754385965
test accuracy: 0.7543859649122807


In [12]:
# Change Directories Again
%cd /content/drive/Shareddrives/cs145project/filtered_data/

/content/drive/Shareddrives/cs145project/filtered_data


In [13]:
path="top-50-analyst-ratings-processed.csv"
headlines=pd.read_csv(path,encoding = "ISO-8859-1")
headlines['title']=headlines['title'].apply(clean_text)
headlines.head()

Unnamed: 0,title,date,stock
0,"piper sandler maintains overweight adobe, lowe...",2020-03-31 06:18:00-04:00,ADBE
1,shares several technology companies trading hi...,2020-03-30 10:23:00-04:00,ADBE
2,"shares several technology, semiconductor softw...",2020-03-27 11:30:00-04:00,ADBE
3,"cramer reveals stock favorites, says intuitive...",2020-03-27 10:10:00-04:00,ADBE
4,shares several software companies trading high...,2020-03-26 10:38:00-04:00,ADBE


In [14]:
input=headlines['title']
cats=model.predict(input)
headlines['sentiment']=cats

from collections import Counter
counts = Counter(headlines['sentiment'])
print("Sentiment Distribution:")
print("    Sentiment\tcount\t% of dataset")
for sentiment, count in counts.items():
  print(f"    {sentiment}\t{count}\t{((count/counts.total())*100):.3f}%")

headlines.head()

Sentiment Distribution:
    Sentiment	count	% of dataset
    neutral	88249	75.431%
    positive	20435	17.467%
    negative	8309	7.102%


Unnamed: 0,title,date,stock,sentiment
0,"piper sandler maintains overweight adobe, lowe...",2020-03-31 06:18:00-04:00,ADBE,neutral
1,shares several technology companies trading hi...,2020-03-30 10:23:00-04:00,ADBE,positive
2,"shares several technology, semiconductor softw...",2020-03-27 11:30:00-04:00,ADBE,positive
3,"cramer reveals stock favorites, says intuitive...",2020-03-27 10:10:00-04:00,ADBE,neutral
4,shares several software companies trading high...,2020-03-26 10:38:00-04:00,ADBE,neutral


In [15]:
headlines.to_pickle('SGD_Classified_Headlines.pkl')

In [17]:
headlines.loc[headlines['stock']=='ADBE']

Unnamed: 0,title,date,stock,sentiment
0,"piper sandler maintains overweight adobe, lowe...",2020-03-31 06:18:00-04:00,ADBE,neutral
1,shares several technology companies trading hi...,2020-03-30 10:23:00-04:00,ADBE,positive
2,"shares several technology, semiconductor softw...",2020-03-27 11:30:00-04:00,ADBE,positive
3,"cramer reveals stock favorites, says intuitive...",2020-03-27 10:10:00-04:00,ADBE,neutral
4,shares several software companies trading high...,2020-03-26 10:38:00-04:00,ADBE,neutral
...,...,...,...,...
1969,"adobe systems (adbe) acquire omniture, inc. (o...",2009-09-16 04:09:00-04:00,ADBE,neutral
1970,sale proceedings omniture inc (omtr) investiga...,2009-09-16 02:18:00-04:00,ADBE,neutral
1971,"fast money crew recommend ung, adbe, bdx yhoo",2009-09-15 19:01:00-04:00,ADBE,neutral
1972,(otcbb: blkl) software power vistacomm's retai...,2009-08-17 07:30:00-04:00,ADBE,neutral
