In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt  
import seaborn as sn
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Load the "1000 Companies" CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/1000-companies/1000_Companies.csv')

## EDA AND FEATURE ENGINEERING

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
data.describe(include = 'all')

In [None]:
# Number of rows (companies) per state.
data.State.value_counts()

In [None]:
# Replace the column labels positionally with identifier-friendly names so that
# attribute access such as data.RDSpend works in the cells below.
# NOTE(review): assumes the CSV has exactly these five columns in this order — confirm.
data.columns = ['RDSpend', 'Administration', 'MarketingSpend', 'State', 'Profit']

In [None]:
##PLOTS

In [None]:
## Let's check Profit against each attribute

In [None]:
# Scatter Profit against R&D spend and label the plot through the Axes
# object that DataFrame.plot returns.
ax = data.plot(x='RDSpend', y='Profit', style='o')
ax.set_title('RDSpend vs Profit')
ax.set_xlabel('RDSpend')
ax.set_ylabel('Profit')
plt.show()

In [None]:
## The plot above shows that Profit mostly increases as RDSpend increases.
## However, the highest Profit values occur when RDSpend is in the 100000-150000 range.

## Can be verified below :

In [None]:
# Split the companies at an R&D spend of 100000 and compare the best Profit
# in each group. The upper group uses >= so that a company spending exactly
# 100000 is not silently dropped from both groups.
# x / y keep their names because the following cells inspect x.shape / y.shape.
x = data[data.RDSpend < 100000]
y = data[data.RDSpend >= 100000]
print("Maximum Profit when RDSpend is below 100000 is :" , x.Profit.max())
print("Maximum Profit when RDSpend is at or above 100000 is :" , y.Profit.max())

In [None]:
# (rows, columns) of the lower-RDSpend subset.
x.shape

In [None]:
# (rows, columns) of the higher-RDSpend subset.
y.shape

In [None]:
## Also it can be seen that the dataset contains 60% data with RDSpend below 100000 and Profit less than 141585.52

In [None]:
# Scatter Profit against Administration cost; title and axis labels are set
# in one call on the returned Axes.
ax = data.plot(x='Administration', y='Profit', style='o')
ax.set(title='Administration vs Profit', xlabel='Administration', ylabel='Profit')
plt.show()

In [None]:
## Above Graph shows uneven data distribution

In [None]:
# Scatter Profit against marketing spend, drawn with red circle markers ('or').
ax = data.plot(x='MarketingSpend', y='Profit', style='or')
ax.set_title('MarketingSpend vs Profit')
ax.set_xlabel('MarketingSpend')
ax.set_ylabel('Profit')
plt.show()

In [None]:
## It can be seen that the Profit is highest when  MarketingSpend is close to 300000

We can re-check from below :

In [None]:
# Look up the MarketingSpend of the row(s) that reach the maximum Profit.
z = data['Profit'].max()
data.loc[data['Profit'] == z, 'MarketingSpend']

In [None]:
## Which states have the highest total profit?

In [None]:
# Total Profit per state as a two-column frame (State, Profit), shown as a bar chart.
filter_state = data.groupby("State")["Profit"].sum().reset_index()
sn.barplot(x='State', y='Profit', data=filter_state, edgecolor='w')
plt.show()

In [None]:
## Total Profit is highest for California, then New York, then Florida
## Re-checking below

In [None]:
# Row counts per state, to put the per-state Profit totals above into context.
data.State.value_counts()

In [None]:
## Checking the distribution of categorical and continuous vars

In [None]:
## Individually checking the distribution for each var

In [None]:
## For continuous variables we plot a distplot from the seaborn library

In [None]:
# Distribution of R&D spend, with a rug of the individual observations.
sn.distplot(data['RDSpend'], rug=True)

In [None]:
# Distribution of Administration cost, with a rug of the individual observations.
sn.distplot(data['Administration'], rug=True)

In [None]:
# Distribution of marketing spend, with a rug of the individual observations.
sn.distplot(data['MarketingSpend'], rug=True)

In [None]:
## For categorical variables we plot a bar plot

In [None]:
# Bar chart of the number of companies per state.
data.State.value_counts().plot.bar()

In [None]:
## Let's check the Profit distribution: once plotted, we can see that the average Profit lies roughly between 100000 and 200000.

In [None]:
# Distribution of the target (Profit) on a larger canvas.
# tight_layout() adjusts the spacing of axes that already exist, so it is
# called after the plot is drawn rather than on the still-empty figure.
plt.figure(figsize=(15, 10))
sn.distplot(data['Profit'])
plt.tight_layout()

In [None]:
# Mean Profit, to compare against the distribution plotted above.
data.Profit.mean()

## Feature Engineering


In [None]:
## Encoding 
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn the State strings into integer codes.
le = LabelEncoder()

In [None]:
# Replace the State strings with the integer codes learned by the encoder.
# NOTE(review): integer codes give a linear model an ordering between states;
# one-hot encoding is the usual choice for a nominal feature — confirm intent.
data['State'] = le.fit_transform(data['State'])

In [None]:
# Mark the encoded State column as categorical rather than plain integer.
data['State'] = data['State'].astype('category')

In [None]:
# Row counts per encoded state value.
data.State.value_counts()

## Checking for missing values

In [None]:
# Total number of missing cells: per-column null counts, summed over all columns.
data.isnull().sum().sum()

**So there are no missing values in the dataset.**

*We need to predict the Profit of each company from the 4 input parameters.*

In [None]:
# Pairwise correlation between the columns.
data.corr()

**From the above correlation matrix it can be seen that the independent variables R&D Spend and Marketing Spend are highly correlated, with a value of 0.978407.**
*This can lead to multicollinearity, which makes the fitted regression coefficients unreliable.*

**Heatmap of the correlations**

In [None]:
# Same correlation matrix as a heatmap, with the value written in each cell.
sn.heatmap(data=data.corr(), annot=True)

**We will first build the linear regression model with all the variables.**
**Then we will build a second model without the correlated variable and check which model is better.**

## Splitting the dataset

In [None]:
# Features: the first four columns (everything before Profit).
data_x = data.iloc[:, :4]

In [None]:
# Confirm which columns ended up in the feature matrix.
data_x.columns

In [None]:
# (rows, columns) of the feature matrix.
data_x.shape

In [None]:
# Target: the Profit column (the fifth column after the rename above).
data_y = data['Profit']

In [None]:
# Display the target series.
data_y

In [None]:
# Length of the target series; should match the row count of data_x.
data_y.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_x_train, data_x_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Shape of the training features.
data_x_train.shape

In [None]:
# Shape of the training target.
data_y_train.shape

In [None]:
# Shape of the test features.
data_x_test.shape

In [None]:
# Shape of the test target.
data_y_test.shape

## Modelling
## Model 1

In [None]:
# Model 1: linear regression on all four features.
lr = LinearRegression()

In [None]:
# Fit model 1 on the training split only; the test split stays unseen.
lr.fit(data_x_train,data_y_train) ## Training the algorithm

In [None]:
# Intercept of the fitted model 1:
print(lr.intercept_)
# Fitted coefficients, one per feature column of data_x_train:
print(lr.coef_)

In [None]:
# Model 1 predictions of Profit for the held-out test rows.
pred_val = lr.predict(data_x_test)

To compare the actual Profit values of the test set with the predicted values, run the following cell:

In [None]:
# Actual vs predicted Profit, side by side; the index comes from data_y_test
# and the predictions are attached positionally.
compare = data_y_test.to_frame('Actual').assign(Predicted=pred_val)
compare


We can also visualize comparison result as a bar graph using the below script.

As the number of records is huge, for representation purpose I’m taking just 25 records.

In [None]:
# Bar chart of actual vs predicted Profit for the first 25 test rows.
df1 = compare.iloc[:25]
df1.plot.bar(figsize=(16, 10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

The predicted profits are close to the actual ones.

Let’s find the values for metrics using our test data.

In [None]:
# Error metrics for model 1 on the test split; the MSE is computed once and
# reused for the RMSE.
model1_mae = metrics.mean_absolute_error(data_y_test, pred_val)
model1_mse = metrics.mean_squared_error(data_y_test, pred_val)
print('Mean Absolute Error:', model1_mae)
print('Mean Squared Error:', model1_mse)
print('Root Mean Squared Error:', np.sqrt(model1_mse))

###  Building Model 2 by removing the variable causing multicollinearity

In [None]:
# Correlation matrix again, to pick which correlated column to drop for model 2.
data.corr()

In [None]:
# Reload the raw CSV so model 2 starts from data untouched by the earlier rename/encoding.
data1 = pd.read_csv('../input/1000-companies/1000_Companies.csv')

In [None]:
# Preview the freshly loaded frame.
data1.head()

In [None]:
# Keep the columns at positions 0, 1, 3 and 4, i.e. drop the marketing-spend
# column at position 2. The explicit .copy() makes data2 an independent frame,
# so re-encoding data2.State in a later cell does not raise pandas'
# SettingWithCopyWarning.
data2 = data1.iloc[:, [0, 1, 3, 4]].copy()  ## Removing MarketingSpend

In [None]:
# Confirm the marketing-spend column is gone.
data2.head()

In [None]:
# State counts before encoding.
data2.State.value_counts()

In [None]:
# Re-fit the shared encoder on data2's State column and store the integer codes.
data2['State'] = le.fit_transform(data2['State'])

In [None]:
# State counts after encoding.
data2.State.value_counts()

In [None]:
# Summary of State before it is converted to a categorical dtype.
data2.State.describe()

In [None]:
# Mark the encoded State column as categorical.
data2['State'] = data2['State'].astype('category')

In [None]:
# Summary of State after the conversion to a categorical dtype.
data2.State.describe()

In [None]:
# Preview the frame with State encoded.
data2.head()

In [None]:
# Features for model 2: the first three columns of data2.
data2_x = data2.iloc[:, :3]

In [None]:
# Preview the model 2 features.
data2_x.head()

In [None]:
# Target for model 2: the fourth (last selected) column of data2.
# NOTE(review): presumably the Profit column — confirm against the CSV header.
data2_y = data2.iloc[:,3]

In [None]:
# Preview the model 2 target.
data2_y.head()

In [None]:
# Same 80/20 split and seed as model 1, so both models are scored on the same rows.
data2_x_train, data2_x_test, data2_y_train, data2_y_test = train_test_split(
    data2_x,
    data2_y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Shape of the model 2 training features.
data2_x_train.shape

In [None]:
# Shape of the model 2 training target.
data2_y_train.shape

In [None]:
# Shape of the model 2 test features.
data2_x_test.shape

In [None]:
# Shape of the model 2 test target.
data2_y_test.shape

In [None]:
# Model 2: linear regression on the reduced feature set.
lr1 = LinearRegression()

In [None]:
# Fit model 2 on its training split.
lr1.fit(data2_x_train,data2_y_train)

In [None]:
# Model 2 predictions for the held-out test rows.
pred_val1 = lr1.predict(data2_x_test)

In [None]:
# Intercept of the fitted model 2:
print(lr1.intercept_)
# Fitted coefficients, one per feature column of data2_x_train:
print(lr1.coef_)

In [None]:
# Actual vs predicted values for model 2; the index comes from data2_y_test
# and the predictions are attached positionally.
compare1 = data2_y_test.to_frame('Actual').assign(Predicted=pred_val1)
compare1

In [None]:
# First 25 comparison rows, used for the bar chart below.
compare2 = compare1.iloc[:25]

In [None]:
# Bar chart of actual vs predicted values for the first 25 test rows of model 2.
compare2.plot.bar(figsize=(16, 10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# Error metrics for model 2 on the test split; the MSE is computed once and
# reused for the RMSE.
model2_mae = metrics.mean_absolute_error(data2_y_test, pred_val1)
model2_mse = metrics.mean_squared_error(data2_y_test, pred_val1)
print('Mean Absolute Error:', model2_mae)
print('Mean Squared Error:', model2_mse)
print('Root Mean Squared Error:', np.sqrt(model2_mse))

*The Root Mean Squared Error for the 1st model is 13768.975083100055.*
*For the 2nd model, with one attribute removed, it is 13965.295519778878.*

*The lower the RMSE, the better the model, so Model 1 is better.*