In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pandas.plotting import autocorrelation_plot
import datetime
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from nltk import tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

In [2]:
filename2 = 'india-news-headlines.csv'
news_df  = pd.read_csv(filename2,parse_dates=[0])

In [3]:
news_df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,2001-01-01,sports.wwe,win over cena satisfying but defeating underta...
1,2001-01-02,unknown,Status quo will not be disturbed at Ayodhya; s...
2,2001-01-02,unknown,Fissures in Hurriyat over Pak visit
3,2001-01-02,unknown,America's unwanted heading for India?
4,2001-01-02,unknown,For bigwigs; it is destination Goa


In [4]:
news_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3297172 entries, 0 to 3297171
Data columns (total 3 columns):
 #   Column             Dtype         
---  ------             -----         
 0   publish_date       datetime64[ns]
 1   headline_category  object        
 2   headline_text      object        
dtypes: datetime64[ns](1), object(2)
memory usage: 75.5+ MB


In [5]:
news_df.isnull().sum()

publish_date         0
headline_category    0
headline_text        0
dtype: int64

In [4]:
print("min date is ", news_df.publish_date.min())
print("max date is ", news_df.publish_date.max())

min date is  2001-01-01 00:00:00
max date is  2020-06-30 00:00:00


In [5]:
filename1 ='^BSESN.csv'

stock_df = pd.read_csv (filename1,parse_dates=[0])

In [6]:

stock_df.isnull().sum()
stock_df.dropna(inplace=True)

In [7]:
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-10-03,38137.871094,38310.929688,37957.558594,38106.871094,38106.871094,59200.0
1,2019-10-04,38401.488281,38403.539063,37633.359375,37673.308594,37673.308594,52900.0
2,2019-10-07,37853.800781,37919.46875,37480.53125,37531.980469,37531.980469,40400.0
3,2019-10-09,37628.050781,38209.839844,37415.828125,38177.949219,38177.949219,36400.0
4,2019-10-10,38130.230469,38130.230469,37802.929688,37880.398438,37880.398438,27300.0


In [8]:
min_stock_date = stock_df.Date.min()
max_stock_date = stock_df.Date.max()
print("min date is ", stock_df.Date.min())
print("max date is ", stock_df.Date.max())

min date is  2019-10-03 00:00:00
max date is  2020-10-01 00:00:00


In [9]:
news_stock_df = news_df[(news_df['publish_date'] >= min_stock_date) & (news_df['publish_date'] <= max_stock_date)]


In [10]:
 news_df_combined = news_stock_df.groupby('publish_date')['headline_text'].apply(' '.join).reset_index()

In [11]:
news_df_combined['Date'] = news_df_combined['publish_date']
news_df_combined.drop(columns=['publish_date'],inplace=True)
news_df_combined.head()

Unnamed: 0,headline_text,Date
0,meet greet bond and seize the day networking s...,2019-10-03
1,I am in love with a relative Why no child welf...,2019-10-04
2,Channapatna artisans go tech-savvy to beat dro...,2019-10-05
3,Getting enough prebiotics? What is Wardrobing?...,2019-10-06
4,Wanna smell fresh all day? Apply perfume on th...,2019-10-07


In [12]:
news_stock_df = news_df_combined.merge(stock_df,how='inner',on='Date')

In [13]:
news_stock_df.head()


Unnamed: 0,headline_text,Date,Open,High,Low,Close,Adj Close,Volume
0,meet greet bond and seize the day networking s...,2019-10-03,38137.871094,38310.929688,37957.558594,38106.871094,38106.871094,59200.0
1,I am in love with a relative Why no child welf...,2019-10-04,38401.488281,38403.539063,37633.359375,37673.308594,37673.308594,52900.0
2,Wanna smell fresh all day? Apply perfume on th...,2019-10-07,37853.800781,37919.46875,37480.53125,37531.980469,37531.980469,40400.0
3,Assam: Hydel project pipeline bursts; 4 washed...,2019-10-09,37628.050781,38209.839844,37415.828125,38177.949219,38177.949219,36400.0
4,Ahmedabad: Dog set loose on cops; PASA- booked...,2019-10-10,38130.230469,38130.230469,37802.929688,37880.398438,37880.398438,27300.0


In [14]:
news_stock_df['headline_text'][0]

"meet greet bond and seize the day networking sites apps show you how Navratri Day 5: Colour today; Mata Skandamata mantra; puja; ghatasthapana and significance This 2-ingredient post workout drink is simple and effective The spotlight shines bright; say readers Metro extension on home stretch tops wishlist of Secunderabad residents Roshni Chopra replaces Monalisa in a game show Malayalis warm up to dandiya raas This bride wore a gorgeous silk lehenga in red colour for her wedding in London Sayantani Ghosh: I want to spend a whole day at Janpath shopping for jewellery Trend alert: Kareena Kapoor just wore the hottest jewellery trend Bengaluru plogs on Gandhi Jayanti in an effort to make the city plastic free Women power on stage at this do in Banaras Sonyaa Ayodhyay finds her soulmate; to tie the knot soon Micro Review: 'Those Days in Delhi' by Yashodhara Lal Bijal Joshi: I don't want to be typecast as a Gujarati character Women power on stage at this do in Banaras 'Tum Hi Aana' is the

In [15]:
# create the function to get subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

def getSIA(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    return sentiment
        

In [16]:
news_stock_df['subjectivity'] = news_stock_df['headline_text'].apply(getSubjectivity)
news_stock_df['polarity'] = news_stock_df['headline_text'].apply(getPolarity)
                                                                     

In [17]:
news_stock_df

Unnamed: 0,headline_text,Date,Open,High,Low,Close,Adj Close,Volume,subjectivity,polarity
0,meet greet bond and seize the day networking s...,2019-10-03,38137.871094,38310.929688,37957.558594,38106.871094,38106.871094,59200.0,0.429271,0.035780
1,I am in love with a relative Why no child welf...,2019-10-04,38401.488281,38403.539063,37633.359375,37673.308594,37673.308594,52900.0,0.418134,0.090559
2,Wanna smell fresh all day? Apply perfume on th...,2019-10-07,37853.800781,37919.468750,37480.531250,37531.980469,37531.980469,40400.0,0.427959,0.080569
3,Assam: Hydel project pipeline bursts; 4 washed...,2019-10-09,37628.050781,38209.839844,37415.828125,38177.949219,38177.949219,36400.0,0.429184,0.080979
4,Ahmedabad: Dog set loose on cops; PASA- booked...,2019-10-10,38130.230469,38130.230469,37802.929688,37880.398438,37880.398438,27300.0,0.404573,0.027153
...,...,...,...,...,...,...,...,...,...,...
175,I never thought I had a voice until today: Vid...,2020-06-24,35679.738281,35706.550781,34794.929688,34868.980469,34868.980469,26600.0,0.387802,0.088329
176,Truck firms look for new export markets to sel...,2020-06-25,34525.390625,35081.609375,34499.781250,34842.101563,34842.101563,24600.0,0.403998,0.027038
177,Containment zone residents slam high prices ch...,2020-06-26,35144.781250,35254.878906,34910.339844,35171.269531,35171.269531,24800.0,0.381841,0.068333
178,6 hot and stylish bikini looks of Katrina Kaif...,2020-06-29,34926.949219,35032.359375,34662.058594,34961.519531,34961.519531,18300.0,0.396828,0.061956


In [34]:
compound = []
neg = []
pos = []
neutral = []
SIA = 0

for i in range(0, len(news_stock_df['headline_text'])):
    
    SIA = getSIA(news_stock_df['headline_text'][i])
    compound.append(SIA['compound'])
    neg.append(SIA['neg'])
    pos.append(SIA['pos'])
    neutral.append(SIA['neu'])


In [35]:
news_stock_df['neg'] = neg


In [36]:
news_stock_df['pos'] = pos


In [38]:
news_stock_df['neutral'] = neutral

In [40]:
news_stock_df['compound'] = compound

In [41]:
news_stock_df.head()


Unnamed: 0,headline_text,Date,Open,High,Low,Close,Adj Close,Volume,subjectivity,polarity,neg,pos,neutral,compound
0,meet greet bond and seize the day networking s...,2019-10-03,38137.871094,38310.929688,37957.558594,38106.871094,38106.871094,59200.0,0.429271,0.03578,0.134,0.095,0.771,-0.9998
1,I am in love with a relative Why no child welf...,2019-10-04,38401.488281,38403.539063,37633.359375,37673.308594,37673.308594,52900.0,0.418134,0.090559,0.151,0.094,0.755,-0.9999
2,Wanna smell fresh all day? Apply perfume on th...,2019-10-07,37853.800781,37919.46875,37480.53125,37531.980469,37531.980469,40400.0,0.427959,0.080569,0.126,0.084,0.79,-0.9998
3,Assam: Hydel project pipeline bursts; 4 washed...,2019-10-09,37628.050781,38209.839844,37415.828125,38177.949219,38177.949219,36400.0,0.429184,0.080979,0.156,0.096,0.747,-0.9999
4,Ahmedabad: Dog set loose on cops; PASA- booked...,2019-10-10,38130.230469,38130.230469,37802.929688,37880.398438,37880.398438,27300.0,0.404573,0.027153,0.148,0.096,0.755,-0.9999


In [42]:
keep_columns = ['Open','High','Low','Close','Volume','subjectivity','polarity','neg','pos','neutral','compound']

In [43]:
df = news_stock_df[keep_columns]

In [68]:
df.isnull().sum()

Open            0
High            0
Low             0
Close           0
Volume          0
subjectivity    0
polarity        0
neg             0
pos             0
neutral         0
compound        0
dtype: int64

In [69]:
df.shape

(180, 11)

In [71]:
x = df.drop(columns=['Close'])
y = df['Close']
print(len(x),len(y),x.shape, y.shape)

180 180 (180, 10) (180,)


In [72]:
xtrain,xtest,ytrain,ytest= train_test_split(x,y,test_size=0.3, random_state=42)

In [73]:
print(len(xtrain), len(ytrain), len(xtest), len(ytest))

126 126 54 54


In [74]:
#implement linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

LinearRegression()

In [75]:
ypred = model.predict(xtest)

In [76]:
len(ypred)

54

In [77]:
len(ytest)

54

In [78]:
#comparisions between predicted and actual values

test_result = pd.DataFrame({'Actual': ytest, 'Predicted': ypred})  

In [79]:
test_result

Unnamed: 0,Actual,Predicted
19,40301.960938,40358.31249
42,40445.148438,40565.802967
153,30672.589844,30672.116776
78,41198.660156,41334.994964
145,31371.119141,31011.503948
15,39831.839844,40063.398066
24,40345.078125,40271.937763
68,41872.730469,41741.875152
113,29915.960938,29997.704056
118,29815.589844,29931.389197


In [84]:

train_score  = model.score(xtrain,ytrain)
test_score = model.score(xtest, ytest)


print("accuracy score on train data set : ", round(train_score*100 , 2))
print("accuracy score on test data set : ", round(test_score* 100, 2))

accuracy score on train data set :  99.87
accuracy score on test data set :  99.73
