# Importing necessary modules

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline


## Loading training data that contains already released movies that were scraped using tweepy api

In [62]:
train = pd.read_csv(r"normalized_test.csv")#, encoding = "latin1")
test = pd.read_csv(r"tweets_new.csv")
train.shape


(1969, 11)

In [63]:
train.head()

Unnamed: 0,tweetID,movies_movieName,dateTime,tweet,result,confidence,boxoffice,num_sentiment,movie_ID,starmeter,theatrecount
0,1.07111e+18,A Star is Born,12/7/2018 13:35,@ladygaga U deserve it so much,neg,1.0,194356626.0,0.0,1.0,182.0,3904.0
1,1.07111e+18,Venom,12/7/2018 13:35,I DIDNT KNOW VENOM WAS BACK IT LOOKS SO BEAUTI...,neg,0.8,212376051.0,0.0,10.0,80.0,4250.0
2,1.07111e+18,Robin Hood,12/7/2018 13:36,"Robinhood (the new one w/ Jamie Foxx), Aquaman...",neg,1.0,23300536.0,0.0,9.0,63.0,2827.0
3,1.07111e+18,Venom,12/7/2018 13:36,"deg God won't stop the snake from biting you, ...",neg,0.8,212376051.0,0.0,10.0,80.0,4250.0
4,1.07111e+18,Creed II,12/7/2018 13:36,Watched #Creed2 - highly recommend.,neg,0.6,85041763.0,0.0,3.0,67.0,3576.0


In [64]:
print("Number of missing values is",test.isna().sum().sum())

Number of missing values is 624


## Preprocessing input data by cleaning cells which has no values

In [65]:
train_clean = train.dropna()
train.head()
train_clean.shape


(1605, 11)

In [66]:
test_clean = test.dropna()
test.head()
test_clean.shape


(25560, 10)

In [67]:
print("Number of missing values is",train_clean.isna().sum().sum())

Number of missing values is 0


In [68]:
print("Number of missing values is",test_clean.isna().sum().sum())

Number of missing values is 0


## Calculating standard deviation, mean based on confidence and sentiment, which is used to convert them into one of the feature sets

In [88]:
m=train_clean.groupby('movie_ID')[['num_sentiment']].std()

In [89]:
a=test_clean.groupby('movie_ID')[['num_sentiment']].std()

In [90]:
n=train_clean.groupby('movie_ID')[['num_sentiment']].std()

In [91]:
b=test_clean.groupby('movie_ID')[['num_sentiment']].std()

In [92]:
o=train_clean.groupby('movie_ID')[['confidence']].std()

In [93]:
c=test_clean.groupby('movie_ID')[['confidence']].std()

In [94]:
p=train_clean.groupby('movie_ID')[['confidence']].mean()

In [95]:
d=test_clean.groupby('movie_ID')[['confidence']].mean()

## Merging the above dataframes

In [96]:
x=pd.merge(m, n, on='movie_ID')
y=pd.merge(o, p, on='movie_ID')
z=pd.merge(x, y, on='movie_ID')

z = z.rename(columns={'num_sentiment_x': 'std_dev_sent', 'num_sentiment_y': 'mean_sent', 'confidence_x': 'std_dev_conf', 'confidence_y': 'mean_conf'})
print(z)

          std_dev_sent  mean_sent  std_dev_conf  mean_conf
movie_ID                                                  
1.0           0.444880   0.444880      0.125356   0.853139
2.0           0.467099   0.467099      0.174773   0.836364
3.0           0.466694   0.466694      0.142489   0.896970
5.0           0.390021   0.390021      0.101575   0.907143
6.0           0.477567   0.477567      0.140175   0.866667
7.0           0.461057   0.461057      0.129362   0.856716
8.0           0.413701   0.413701      0.132039   0.873333
9.0           0.410391   0.410391      0.119649   0.920000
10.0          0.373646   0.373646      0.130322   0.878963
11.0          0.410414   0.410414      0.146592   0.875000


In [97]:
f=pd.merge(a, b, on='movie_ID')
g=pd.merge(c, d, on='movie_ID')
h=pd.merge(x, y, on='movie_ID')
h = h.rename(columns={'num_sentiment_x': 'std_dev_sent', 'num_sentiment_y': 'mean_sent', 'confidence_x': 'std_dev_conf', 'confidence_y': 'mean_conf'})
print(h)

          std_dev_sent  mean_sent  std_dev_conf  mean_conf
movie_ID                                                  
1.0           0.444880   0.444880      0.125356   0.853139
2.0           0.467099   0.467099      0.174773   0.836364
3.0           0.466694   0.466694      0.142489   0.896970
5.0           0.390021   0.390021      0.101575   0.907143
6.0           0.477567   0.477567      0.140175   0.866667
7.0           0.461057   0.461057      0.129362   0.856716
8.0           0.413701   0.413701      0.132039   0.873333
9.0           0.410391   0.410391      0.119649   0.920000
10.0          0.373646   0.373646      0.130322   0.878963
11.0          0.410414   0.410414      0.146592   0.875000


In [98]:
h1 = train_clean.drop(columns=["tweetID", "movies_movieName", "dateTime","tweet","result", "confidence","num_sentiment"])
h1.head



<bound method NDFrame.head of         boxoffice  movie_ID  starmeter  theatrecount
0     194356626.0       1.0      182.0        3904.0
1     212376051.0      10.0       80.0        4250.0
2      23300536.0       9.0       63.0        2827.0
3     212376051.0      10.0       80.0        4250.0
4      85041763.0       3.0       67.0        3576.0
5     194356626.0       1.0      182.0        3904.0
6      34638624.0       8.0      157.0        2393.0
7     194356626.0       1.0      182.0        3904.0
8     194356626.0       1.0      182.0        3904.0
9     194356626.0       1.0      182.0        3904.0
10     34638624.0       8.0      157.0        2393.0
11     34638624.0       8.0      157.0        2393.0
12    212376051.0      10.0       80.0        4250.0
13    212376051.0      10.0       80.0        4250.0
14    173958573.0       7.0      108.0        3865.0
15    137596591.0       6.0       54.0        4163.0
16     34638624.0       8.0      157.0        2393.0
17    194356626.

In [99]:
r1 = test_clean.drop(columns=["tweetID", "movies_movieName", "dateTime","tweet","result", "confidence","num_sentiment"])
r1.head



<bound method NDFrame.head of        theatrecount  starmeter  movie_ID
0            3000.0      621.0       3.0
1            3000.0      621.0       3.0
2            3000.0      621.0       3.0
3            3000.0      621.0       3.0
4            4160.0       53.0       6.0
5            3000.0      621.0       3.0
6            3000.0      621.0       3.0
7            4000.0      581.0       7.0
8            3000.0      621.0       3.0
9            3000.0      621.0       3.0
10           4125.0       36.0       2.0
11           3000.0      621.0       3.0
12           3000.0      621.0       3.0
13           3000.0      621.0       3.0
14           3000.0      621.0       3.0
15           3000.0      621.0       3.0
16           3000.0      621.0       3.0
17           3000.0      621.0       3.0
18           3000.0      621.0       3.0
19           4125.0       36.0       2.0
20           3000.0      621.0       3.0
21           4125.0       36.0       2.0
22           3000.0      62

In [100]:
result=pd.merge(h1, z, on='movie_ID')
result.head


<bound method NDFrame.head of         boxoffice  movie_ID  starmeter  theatrecount  std_dev_sent  mean_sent  \
0     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
1     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
2     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
3     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
4     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
5     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
6     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
7     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
8     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
9     194356626.0       1.0      182.0        3904.0      0.444880   0.444880   
10    194356626.0       1.0      182.0        3904.0      0.444880   0.444880  

In [101]:
result1=pd.merge(r1, h, on='movie_ID')
result1.head


<bound method NDFrame.head of       theatrecount  starmeter  movie_ID  std_dev_sent  mean_sent  \
0           3000.0      621.0       3.0      0.466694   0.466694   
1           3000.0      621.0       3.0      0.466694   0.466694   
2           3000.0      621.0       3.0      0.466694   0.466694   
3           3000.0      621.0       3.0      0.466694   0.466694   
4           3000.0      621.0       3.0      0.466694   0.466694   
5           3000.0      621.0       3.0      0.466694   0.466694   
6           3000.0      621.0       3.0      0.466694   0.466694   
7           3000.0      621.0       3.0      0.466694   0.466694   
8           3000.0      621.0       3.0      0.466694   0.466694   
9           3000.0      621.0       3.0      0.466694   0.466694   
10          3000.0      621.0       3.0      0.466694   0.466694   
11          3000.0      621.0       3.0      0.466694   0.466694   
12          3000.0      621.0       3.0      0.466694   0.466694   
13          3000.0

## Creating the samples and features to create the shape for multivariate regression

In [84]:
n,d = result.shape
print("Number of features:", d)
print("Number of samples:", n)

Number of features: 8
Number of samples: 1605


## Training with 8 features and y-axis as box office results to be predicted

In [85]:
Xtrain = result.iloc[:,1:8]
#Xtrain.head
ytrain = result.iloc[:,0]
ytrain.head

<bound method NDFrame.head of 0       194356626.0
1       194356626.0
2       194356626.0
3       194356626.0
4       194356626.0
5       194356626.0
6       194356626.0
7       194356626.0
8       194356626.0
9       194356626.0
10      194356626.0
11      194356626.0
12      194356626.0
13      194356626.0
14      194356626.0
15      194356626.0
16      194356626.0
17      194356626.0
18      194356626.0
19      194356626.0
20      194356626.0
21      194356626.0
22      194356626.0
23      194356626.0
24      194356626.0
25      194356626.0
26      194356626.0
27      194356626.0
28      194356626.0
29      194356626.0
           ...     
1575    166838160.0
1576    166838160.0
1577    166838160.0
1578    166838160.0
1579    166838160.0
1580    166838160.0
1581    166838160.0
1582    166838160.0
1583    166838160.0
1584    166838160.0
1585    166838160.0
1586    166838160.0
1587    166838160.0
1588    166838160.0
1589    166838160.0
1590    166838160.0
1591    166838160.0
1592    16

In [102]:
Xtest = result.iloc[:,1:8]
ytest = result.iloc[:,0]
Xval = result1

## Multivariate Regression and its outputs

In [111]:
regr1 = linear_model.LinearRegression()
regr1.fit(Xtrain, ytrain)

ypred = regr1.predict(Xtest)
yval = regr1.predict(Xval)
#print(Xval[1],yval)
print(yval)
print('Features selected: \n', regr1.coef_.nonzero()[0].size)
print("The MSE of the model is: %.2f" % mean_squared_error(ytest, ypred))
print('Variance score: %.2f' % r2_score(ytest, ypred))

[-1.50447347e+11 -1.50447347e+11 -1.50447347e+11 ... -2.11781121e+11
 -2.11781121e+11 -2.11781121e+11]
Features selected: 
 7
The MSE of the model is: 714375847700859.25
Variance score: 0.90
