In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import time
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
In this notebook I will be analysing a news article dataset containing 4 million articles covering 6000 stocks. First, let's look at the data we have in this dataset. I will only go in depth on data that has not been covered in the other analysis.

In this notebook, I will also add the data for the labels. I have made a dataset with the 10-year data of the most frequently occurring stocks.

In [None]:
# Load the news headlines. Malformed CSV rows are skipped with a warning.
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in 2.0) is replaced
# by `on_bad_lines="warn"`, which is the same skip-and-warn behaviour.
data = pd.read_csv(
    "../input/massive-stock-news-analysis-db-for-nlpbacktests/analyst_ratings_processed.csv",
    on_bad_lines="warn",
)

# For quick experiments, uncomment to work on a small subset:
#data = data.sample(1000)

# Historical price rows used later to label each headline.
histdata = pd.read_csv("../input/10y-historical-stock-data/stockhistory.csv")

# Drop every headline row with a missing field, then display the remaining
# per-column NaN counts as a sanity check.
data = data.dropna()
data.isna().sum()

In [None]:
def getUniqueness(dataset):
    """Print the number of distinct values in every column, then the row count.

    Args:
        dataset: a pandas DataFrame.

    Returns:
        None. One line is printed per column, followed by a "total rows" line.
    """
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
    # (column name, column Series) pairs and exists in every pandas version.
    for columnName, columnData in dataset.items():
        print(f"unique values in [{columnName}]: {columnData.nunique()}")

    print(f"total rows: {len(dataset.index)}")
# Print per-column distinct-value counts and the row count for `data`.
getUniqueness(data)

# Date
Let's split the date again. We can see that this dataset covers a much larger timeframe than the financial tweets dataset.

In [None]:
# Keep the raw timestamp string under "datetime" so that "date" can hold only the day part.
data = data.rename(columns={'date': 'datetime'})

# Split the timestamp on whitespace into its date and time pieces.
stamp_parts = data["datetime"].str.split(expand=True)
data[['date', 'time']] = stamp_parts

# Break the date into its dash-separated components.
date_parts = data["date"].str.split('-', expand=True)
data[['year', 'month', 'day']] = date_parts

# Drop the last six characters of the time string.
data['time'] = data["time"].str.slice(stop=-6)
data.sample(10)


In [None]:
def getUnixTime(row):
    """Return the epoch timestamp (float seconds) of midnight on the row's date.

    Args:
        row: mapping with "year", "month" and "day" entries convertible to int.

    Returns:
        The value of time.mktime for that calendar day at 00:00:00.
        NOTE(review): time.mktime interprets the date in the machine's local
        timezone, so the result depends on the kernel's TZ setting.
    """
    year, month, day = (int(row[part]) for part in ("year", "month", "day"))
    midnight = datetime.datetime(year, month, day)
    return time.mktime(midnight.timetuple())
   
# Attach the per-row epoch timestamp derived from the year/month/day columns.
data["unix"] = data.apply(getUnixTime, axis=1)
data.sample(10).iloc[:5]

In [None]:
# Re-check column cardinalities now that the date/time/unix columns exist.
getUniqueness(data)

In [None]:
# Strip hyperlinks from the headlines in two passes: first any run starting
# with "http", then any run starting with "www".
titles_without_http = data["title"].replace(r'http\S+', '', regex=True)
data["title"] = titles_without_http.replace(r'www\S+', '', regex=True)

In [None]:
import re
import nltk
import ssl
 

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# The 20 most frequent whitespace-separated, lowercased tokens across all
# headlines are treated as corpus-specific stop words.
all_titles = ' '.join(data['title']).lower()
freq = pd.Series(all_titles.split()).value_counts().head(20)

# Final stop-word set: NLTK English list + frequent tokens + manual extras.
extra_words = ['amp']
stop_words = (
    set(stopwords.words("english"))
    | set(freq.index.tolist())
    | set(extra_words)
)

In [None]:
# Cleaned headlines, appended to by editText in call order.
corpus = []

def editText(textColumn):
    """Normalise one headline into a space-separated string of lemmatised keywords.

    Steps: replace every character that is not an ASCII letter with a space,
    lowercase, split on whitespace, drop tokens found in the module-level
    `stop_words`, and lemmatise what remains with WordNet.

    Side effect: the cleaned string is also appended to the module-level
    `corpus` list.

    Args:
        textColumn: the headline text (a str).

    Returns:
        The cleaned, space-joined keyword string.
    """
    # After this substitution only letters and spaces remain. The former
    # tag-removal regex (which searched for a literal "&"), the digit/\W
    # collapse and the newline removal could therefore never change the token
    # list, and were dropped together with the unused PorterStemmer.
    letters_only = re.sub('[^a-zA-Z]', ' ', textColumn).lower()

    lem = WordNetLemmatizer()
    # Stop words are filtered on the raw token, before lemmatisation.
    keywords = [lem.lemmatize(word) for word in letters_only.split()
                if word not in stop_words]

    text = " ".join(keywords)
    corpus.append(text)
    return text

# editText only needs the title, so map it over that single column instead of
# a row-wise DataFrame.apply, which builds a Series for every row.
# Results and the order of appends to `corpus` are unchanged.
data["keywords"] = data["title"].apply(editText)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle  # was missing: pickle.dump below raised NameError
import re

# Count uni-, bi- and tri-grams over the cleaned headlines, ignoring terms
# present in more than 80% of documents and keeping at most 100000 features.
# stop_words is converted to a list: scikit-learn documents 'english', a list
# or None for this parameter, and recent releases reject a set.
cv = CountVectorizer(max_df=0.8, stop_words=list(stop_words), max_features=100000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# Persist the document-term matrix (X itself, not the fitted vectorizer).
with open("countVector.pkl", 'wb') as fout:
    pickle.dump(X, fout)
#list(cv.vocabulary_.keys())[:20]

In [None]:
# Keep a handle on the full keyword-annotated frame before `data` is narrowed.
keywordData = data

# Tickers that appear in the price history (most frequent first).
stockseries = histdata["stock"].value_counts().index.tolist()

# Retain only the headlines whose ticker has price history.
has_price_history = data["stock"].isin(stockseries)
data = data.loc[has_price_history]
getUniqueness(data)

In [None]:

def getDataForDay(stock,timestamp,days):
    """Return the `histdata` rows for `stock` near a day offset from `timestamp`.

    The target instant is `timestamp + days * 86400`. Rows are kept when their
    "timestamps" value lies strictly within 43200 seconds of that instant.

    Args:
        stock: value matched against histdata["stock"].
        timestamp: base epoch time in seconds.
        days: number of days to shift the base time by.

    Returns:
        The matching slice of the module-level `histdata` frame (possibly empty).
    """
    centre = timestamp + (days * 86400)
    same_stock = histdata['stock'] == stock
    after_lower = histdata['timestamps'] > centre - 43200
    before_upper = histdata['timestamps'] < centre + 43200
    return histdata.loc[same_stock & after_lower & before_upper]

def getDelta(row):
    """Label a headline by the direction of its stock's close price two days on.

    Looks up the price rows for row["stock"] around row["unix"] and around two
    days later via getDataForDay, then compares the first "close" of each.

    Args:
        row: mapping with "stock" and "unix" entries.

    Returns:
        1 if the later close minus the current close is >= 0, 0 otherwise,
        or None when either lookup returns no rows.
    """
    # The former `global matches` declaration was never used and is removed.
    current = getDataForDay(row["stock"], row["unix"], 0)
    if current.empty:
        return None

    # Only query the future window once a current price is known to exist.
    future = getDataForDay(row["stock"], row["unix"], 2)
    if future.empty:
        return None

    diff = future["close"].iloc[0] - current["close"].iloc[0]
    return 1 if diff >= 0 else 0

# Label a random subset of at most 120000 headlines. The size is capped at the
# number of available rows (sample() raises when asked for more than exist) and
# the seed is fixed so that a re-run labels the same rows.
data = data.sample(n=min(120000, len(data)), random_state=42)

# The previous `data.set_index("unix")`, `data.dropna()` and
# `histdata.set_index("timestamps")` calls discarded their results, so they had
# no effect. They are removed rather than assigned: getDelta and getDataForDay
# read "unix" and "timestamps" as ordinary columns.

t = time.process_time()
data["increased"] = data.apply(getDelta, axis=1)
d = time.process_time() - t
# CPU time spent labelling, reported in hours.
print(f'lambda time : {(d)/60/60}')
data['increased'].value_counts()

In [None]:
def get_top_n_words(corpus, n=None, ngram_size=1):
    """Return the most frequent n-grams of a given length in `corpus`.

    The corpus is vectorised with the module-level fitted CountVectorizer `cv`,
    and only vocabulary entries made of exactly `ngram_size`
    whitespace-separated words are ranked.

    Args:
        corpus: iterable of documents (strings) accepted by cv.transform.
        n: number of entries to return; None returns all of them.
        ngram_size: words per ranked term. Defaults to 1 (single words), which
            is the original behaviour.

    Returns:
        A list of (term, total count) tuples sorted by count, highest first.
    """
    bag_of_words = cv.transform(corpus)
    # Column sums: total occurrences of each vocabulary term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [
        (term, sum_words[0, idx])
        for term, idx in cv.vocabulary_.items()
        if len(term.split()) == ngram_size
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Convert the 20 most frequent words to a dataframe for plotting.
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]

# Bar plot of the most frequent words.
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(data=top_df, x="Word", y="Freq")
word_labels = g.get_xticklabels()
g.set_xticklabels(word_labels, rotation=30)

In [None]:
def get_top_n2_words(corpus, n=None):
    """Return the `n` most frequent bi-grams in `corpus`.

    Uses the module-level fitted CountVectorizer `cv` and ranks only the
    vocabulary terms that consist of exactly two words.

    Returns:
        A list of (bi-gram, total count) tuples, highest count first.
    """
    totals = cv.transform(corpus).sum(axis=0)
    bigram_counts = (
        (term, totals[0, column])
        for term, column in cv.vocabulary_.items()
        if len(term.split()) == 2
    )
    ranked = sorted(bigram_counts, key=lambda entry: entry[1], reverse=True)
    return ranked[:n]

# Tabulate the 20 most frequent bi-grams.
top2_words = get_top_n2_words(corpus, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns = ["Bi-gram", "Freq"]
print(top2_df)

# Bar plot of the most frequent bi-grams.
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(data=top2_df, x="Bi-gram", y="Freq")
bigram_labels = h.get_xticklabels()
h.set_xticklabels(bigram_labels, rotation=45)

In [None]:
def get_top_n3_words(corpus, n=None):
    """Return the `n` most frequent tri-grams in `corpus`.

    Uses the module-level fitted CountVectorizer `cv` and ranks only the
    vocabulary terms that consist of exactly three words.

    Returns:
        A list of (tri-gram, total count) tuples, highest count first.
    """
    counts_per_term = cv.transform(corpus).sum(axis=0)

    def is_trigram(term):
        return len(term.split()) == 3

    trigram_freq = []
    for term, position in cv.vocabulary_.items():
        if not is_trigram(term):
            continue
        trigram_freq.append((term, counts_per_term[0, position]))

    trigram_freq = sorted(trigram_freq, key=lambda item: item[1], reverse=True)
    return trigram_freq[:n]

# Tabulate the 20 most frequent tri-grams.
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]
print(top3_df)

# Bar plot of the most frequent tri-grams.
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(data=top3_df, x="Tri-gram", y="Freq")
trigram_labels = j.get_xticklabels()
j.set_xticklabels(trigram_labels, rotation=45)

# Adding historical stock data

In [None]:
# Count the cells that hold an empty string. The previous
# `len(np.where(...))` always printed 2, because np.where on a 2-D mask returns
# a (row indices, column indices) tuple; summing the boolean mask gives the
# real number of empty cells.
print(int((data == '').to_numpy().sum()))
data.head(10)

In [None]:
# Collect the rows whose "time" value is missing and report how many there are.
time_is_missing = data["time"].isna()
df1 = data.loc[time_is_missing]
len(df1)

In [None]:
# Write the labelled sample and the full keyword-annotated frame to the working
# directory, without the index column.
for frame, out_path in ((data, "processed_news.csv"), (keywordData, "keyword_data.csv")):
    frame.to_csv(out_path, index=False)
