In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('mice imputation.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df= np.round(df)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.0,136898.0,471784.0,New York,192262.0
1,162598.0,151378.0,443899.0,California,191792.0
2,153442.0,101146.0,407935.0,Florida,191050.0
3,144372.0,118672.0,383200.0,New York,182902.0
4,142107.0,91392.0,366168.0,Florida,166188.0


In [4]:
df= df[['R&D Spend','Administration','Marketing Spend','Profit']]/10000
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,16.5349,13.6898,47.1784,19.2262
1,16.2598,15.1378,44.3899,19.1792
2,15.3442,10.1146,40.7935,19.105
3,14.4372,11.8672,38.32,18.2902
4,14.2107,9.1392,36.6168,16.6188


In [5]:
df = df.iloc[:,0:-1]
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.5349,13.6898,47.1784
1,16.2598,15.1378,44.3899
2,15.3442,10.1146,40.7935
3,14.4372,11.8672,38.32
4,14.2107,9.1392,36.6168


In [6]:
np.random.seed(9)
df=df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,7.8389,15.3773,29.9737
37,4.407,5.1283,19.7029
2,15.3442,10.1146,40.7935
14,11.9943,15.6547,25.6513
44,2.2178,15.4806,2.8335


In [7]:
df=np.round(df)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [8]:
df.iloc[1,0] = np.NaN
df.iloc[3,1] = np.NaN
df.iloc[-1,-1] = np.NaN

In [9]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [10]:
# Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [11]:
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [12]:
# Remove the col1 imputed value
df1 = df0.copy()

df1.iloc[1,0] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [13]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [14]:
y = df1.iloc[[0,2,3,4],0]
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [15]:
df1.iloc[1,1:]

Administration      5.0
Marketing Spend    20.0
Name: 37, dtype: float64

In [16]:
df1.iloc[1,1:].values

array([ 5., 20.])

In [17]:
df1.iloc[1,1:].values.reshape(1,2)

array([[ 5., 20.]])

In [18]:
df1.iloc[1,1:].values.reshape(2,1)

array([[ 5.],
       [20.]])

In [19]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))

array([23.14158651])

In [20]:
df1.iloc[1,0] = 23.14

In [21]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [22]:
# Remove the col2 imputed value

df1.iloc[3,1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [23]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [24]:
y = df1.iloc[[0,1,2,4],1]
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [25]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))

array([11.06331285])

In [26]:
df1.iloc[3,1] = 11.06

In [27]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [28]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [29]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [30]:
y = df1.iloc[0:4,-1]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [31]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))

array([31.56351448])

In [32]:
df1.iloc[4,-1] = 31.56

In [33]:
# After 1st Iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [34]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


In [35]:
df2 = df1.copy()

df2.iloc[1,0] = np.NaN

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [36]:
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))

array([23.78627207])

In [37]:
df2.iloc[1,0] = 23.78

In [38]:
df2.iloc[3,1] = np.NaN
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))

array([11.22020174])

In [39]:
df2.iloc[3,1] = 11.22

In [40]:
df2.iloc[4,-1] = np.NaN

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))

array([38.87979054])

In [41]:
df2.iloc[4,-1] = 31.56

In [42]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [43]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,0.0


In [44]:
df3 = df2.copy()

df3.iloc[1,0] = np.NaN

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [45]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))

array([24.57698058])

In [46]:
df3.iloc[1,0] = 24.57

In [47]:
df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))

array([11.37282844])

In [48]:
df3.iloc[3,1] = 11.37

In [49]:
df3.iloc[4,-1] = np.NaN

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))

array([45.53976417])

In [50]:
df3.iloc[4,-1] = 45.53

In [51]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [52]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.79,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.15,0.0
44,0.0,0.0,13.97


In [53]:
df4 = df3.copy()

df4.iloc[1,0] = np.NaN

df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [54]:
X = df4.iloc[[0,2,3,4],1:3]
y = df4.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df4.iloc[1,1:].values.reshape(1,2))

array([26.98977176])

In [55]:
df4.iloc[1,0] = 26.98

In [56]:
df4.iloc[3,1] = np.NaN
X = df4.iloc[[0,1,2,4],[0,2]]
y = df4.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df4.iloc[3,[0,2]].values.reshape(1,2))

array([12.74923997])

In [57]:
df4.iloc[3,1] = 12.74

In [58]:
df4.iloc[4,-1] = np.NaN

X = df4.iloc[0:4,0:2]
y = df4.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df4.iloc[4,0:2].values.reshape(1,2))

array([68.71345168])

In [59]:
df4.iloc[4,-1] = 68.71

In [60]:
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,26.98,5.0,20.0
2,15.0,10.0,41.0
14,12.0,12.74,26.0
44,2.0,15.0,68.71


In [61]:
df4-df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,2.41,0.0,0.0
2,0.0,0.0,0.0
14,0.0,1.37,0.0
44,0.0,0.0,23.18
