In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
df = np.round(pd.read_csv('Data/50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)
df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [3]:
df = df.iloc[:,0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [6]:
df = df.copy()

# Now safely assign missing values using .iloc
df.iloc[1, 0] = np.nan
df.iloc[3, 1] = np.nan
df.iloc[-1, -1] = np.nan

In [7]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [8]:
# Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [9]:
# 0th Iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [11]:
# Create a copy of df0
df1 = df0.copy()

# Safely set the value in row 1, column 0 to NaN
df1.iloc[1, 0] = np.nan  # ✅ Use np.nan (lowercase 'n') instead of np.NaN

# View the result
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [12]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [13]:
y = df1.iloc[[0,2,3,4],0]
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [15]:
# Train model
lr = LinearRegression()
lr.fit(X, y)  # X must be a DataFrame with column names

# Prepare the input row for prediction (second row, excluding target column)
input_row = pd.DataFrame([df1.iloc[1, 1:].values], columns=X.columns)

# Predict using model
prediction = lr.predict(input_row)

In [16]:
df1.iloc[1,0] = 23.14

In [17]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [19]:
# Safely remove (set to NaN) the value at row index 3, column index 1
df1.iloc[3, 1] = np.nan

# Display the modified DataFrame
df1


Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [20]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [21]:
y = df1.iloc[[0,1,2,4],1]
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [23]:
# Fit the model (assuming X has column names like ['col1', 'col2'])
lr = LinearRegression()
lr.fit(X, y)

# Extract the column names used for training
feature_names = X.columns  # Should be something like ['col1', 'col3'] if you used cols 0 and 2

# Predict using matching feature names
input_row = pd.DataFrame([df1.iloc[3, [0, 2]].values], columns=feature_names)
prediction = lr.predict(input_row)

In [24]:
df1.iloc[3,1] = 11.06

In [25]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [27]:
# Remove (set to NaN) the value in the last column (col3) at row index 4
df1.iloc[4, -1] = np.nan

# Display the updated DataFrame
df1


Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [28]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [29]:
y = df1.iloc[0:4,-1]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [31]:
# Get the feature names used during training
feature_names = X.columns  # This will be the names of the first two columns in X

# Prepare input row with correct shape and feature names
input_row = pd.DataFrame([df1.iloc[4, 0:2].values], columns=feature_names)

# Predict
prediction = lr.predict(input_row)

In [32]:
df1.iloc[4,-1] = 31.56

In [33]:
# After 1st Iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [34]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


In [36]:
# Create a safe copy of df1
df2 = df1.copy()

# Remove the value at row index 1, column index 0 (set to NaN)
df2.iloc[1, 0] = np.nan

# Display the updated DataFrame
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [38]:
# Define features and target from selected rows (0,2,3,4)
X = df2.iloc[[0, 2, 3, 4], 1:3]  # Columns 1 and 2
y = df2.iloc[[0, 2, 3, 4], 0]    # Column 0

# Train the model
lr = LinearRegression()
lr.fit(X, y)

# Prepare the input row (row 1) with correct column names
input_row = pd.DataFrame([df2.iloc[1, 1:].values], columns=X.columns)

# Predict
prediction = lr.predict(input_row)

In [39]:
df2.iloc[1,0] = 23.78

In [41]:
# Set the value in row 3, column 1 to NaN
df2.iloc[3, 1] = np.nan

# Define X and y (rows with known col2 values)
X = df2.iloc[[0, 1, 2, 4], [0, 2]]  # Using columns 0 and 2 as features
y = df2.iloc[[0, 1, 2, 4], 1]       # Using column 1 (col2) as target

# Fit the model
lr = LinearRegression()
lr.fit(X, y)

# Prepare input row for prediction with column names
input_row = pd.DataFrame([df2.iloc[3, [0, 2]].values], columns=X.columns)

# Predict the missing value
prediction = lr.predict(input_row)

In [42]:
df2.iloc[3,1] = 11.22

In [44]:
# Set value in row 4, last column (col3) to NaN
df2.iloc[4, -1] = np.nan

# Define X and y (first 4 rows where col3 is known)
X = df2.iloc[0:4, 0:2]   # Columns 0 and 1 as features
y = df2.iloc[0:4, -1]    # Last column as target (col3)

# Train the model
lr = LinearRegression()
lr.fit(X, y)

# Prepare input row (row 4, columns 0 and 1) with column names
input_row = pd.DataFrame([df2.iloc[4, 0:2].values], columns=X.columns)

# Predict missing value
prediction = lr.predict(input_row)

In [45]:
df2.iloc[4,-1] = 31.56

In [46]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [47]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,0.0


In [49]:
# Make a copy of df2 to preserve the original
df3 = df2.copy()

# Remove (set to NaN) the value at row index 1, column index 0
df3.iloc[1, 0] = np.nan

# Display the updated DataFrame
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [51]:
X = df3.iloc[[0, 2, 3, 4], 1:3]  # Features
y = df3.iloc[[0, 2, 3, 4], 0]    # Target

# Create and train the model
lr = LinearRegression()
lr.fit(X, y)

# Predict using a DataFrame (to keep column names and avoid warning)
X_new = df3.iloc[[1], 1:3]
y_pred = lr.predict(X_new)

In [52]:
df3.iloc[1,0] = 24.57

In [57]:
# Set NaN in the target column (column index 1) at row 3
df3.iloc[3, 1] = np.nan

# Prepare training data
X = df3.iloc[[0,1,2,4], [0,2]]
y = df3.iloc[[0,1,2,4], 1]

# Drop rows with NaN in features or target
valid_idx = ~X.isnull().any(axis=1) & ~y.isnull()
X = X[valid_idx]
y = y[valid_idx]

# Train model
lr = LinearRegression()
lr.fit(X, y)

# Prepare test data as DataFrame (preserve column names)
X_test = df3.iloc[[3], [0, 2]]

# Predict only if test input has no NaN
if X_test.isnull().any().any():
    print("Cannot predict: test input contains NaN values.")
else:
    y_pred = lr.predict(X_test)
    print("Prediction for row 3:", y_pred[0])

Prediction for row 3: 11.372828442565584


In [58]:
df3.iloc[3,1] = 11.37

In [60]:
# Set NaN in the last column (target) of row 4
df3.iloc[4, -1] = np.nan

# Prepare training data from rows 0 to 3
X = df3.iloc[0:4, 0:2]
y = df3.iloc[0:4, -1]

# Drop rows with NaN in X or y
valid_idx = ~X.isnull().any(axis=1) & ~y.isnull()
X = X[valid_idx]
y = y[valid_idx]

# Train the model
lr = LinearRegression()
lr.fit(X, y)

# Prepare test input (row 4, columns 0 and 1) as DataFrame to retain feature names
X_test = df3.iloc[[4], 0:2]

# Predict if no NaNs in input
if X_test.isnull().any().any():
    print("Cannot predict: test input contains NaN values.")
else:
    y_pred = lr.predict(X_test)
    print("Prediction for row 4:", y_pred[0])

Prediction for row 4: 45.539764168492304


In [61]:
df3.iloc[4,-1] = 45.53

In [62]:
df2.iloc[3,1] = 11.22

In [63]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [64]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.79,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.15,0.0
44,0.0,0.0,13.97
