In [4]:
# Handling Missing Data in Datasets

import pandas as pd
import numpy as np

# Toy dataset: five records in which every column is missing exactly one value.
ages = [25, 30, np.nan, 40, 50]
salaries = [50000, 60000, 70000, np.nan, 90000]
countries = ['USA', 'Canada', 'Mexico', 'USA', np.nan]

# Column order follows the insertion order of this mapping.
data = dict(zip(('Age', 'Salary', 'Country'), (ages, salaries, countries)))

df = pd.DataFrame(data)
print("Dataset with missing values:\n", df)

Dataset with missing values:
     Age   Salary Country
0  25.0  50000.0     USA
1  30.0  60000.0  Canada
2   NaN  70000.0  Mexico
3  40.0      NaN     USA
4  50.0  90000.0     NaN


In [5]:
# Keep only the rows in which every column holds a value; `df` itself is left untouched.
df_dropped = df.dropna(axis=0, how='any')
print("\nDataset after dropping missing values:\n", df_dropped)


Dataset after dropping missing values:
     Age   Salary Country
0  25.0  50000.0     USA
1  30.0  60000.0  Canada


In [10]:
# Strategy-based filling of the missing values

# One replacement value per column: mean for Age, median for Salary and the
# most frequent value (first mode) for Country.
fill_values = {
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
    'Country': df['Country'].mode()[0],
}

# Overwrite each column of `df` with its gap-free version.
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

print("\nDataset after filling missing values:\n", df)


Dataset after filling missing values:
      Age   Salary Country
0  25.00  50000.0     USA
1  30.00  60000.0  Canada
2  36.25  70000.0  Mexico
3  40.00  65000.0     USA
4  50.00  90000.0     USA


In [14]:
# Encoding Categorical Variables

# Replace Country with indicator columns; drop_first=True omits the indicator
# of the first category level, so k categories produce k-1 columns.
categorical_columns = ['Country']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
print("\nOne-hot encoded dataset:\n", df_encoded)



One-hot encoded dataset:
      Age   Salary  Country_Mexico  Country_USA
0  25.00  50000.0           False         True
1  30.00  60000.0           False        False
2  36.25  70000.0            True        False
3  40.00  65000.0           False         True
4  50.00  90000.0           False         True


In [15]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

# Standardise the two numeric columns in place; the indicator columns are not touched.
numeric_columns = ['Age', 'Salary']
scaler = StandardScaler()
standardised_values = scaler.fit_transform(df_encoded[numeric_columns])
df_encoded[numeric_columns] = standardised_values

print("\nDataset after feature scaling:\n", df_encoded)




Dataset after feature scaling:
         Age    Salary  Country_Mexico  Country_USA
0 -1.310001 -1.281423           False         True
1 -0.727778 -0.527645           False        False
2  0.000000  0.226134            True        False
3  0.436667 -0.150756           False         True
4  1.601112  1.733690           False         True


In [1]:
# TASK 1

# Handling Missing Data in Datasets

import pandas as pd
import numpy as np

# Raw demo records; np.nan marks the single missing entry in each column.
data = dict(
    Age=[25, 30, np.nan, 40, 50],
    Salary=[50000, 60000, 70000, np.nan, 90000],
    Country=['USA', 'Canada', 'Mexico', 'USA', np.nan],
)

df = pd.DataFrame(data)
print("Dataset with missing values:\n", df)

Dataset with missing values:
     Age   Salary Country
0  25.0  50000.0     USA
1  30.0  60000.0  Canada
2   NaN  70000.0  Mexico
3  40.0      NaN     USA
4  50.0  90000.0     NaN


In [6]:
# TASK 2

from sklearn.preprocessing import MinMaxScaler

# Impute every column with its own statistic: mean for Age, median for Salary,
# most frequent value (first mode) for Country.
imputation_values = {
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
    'Country': df['Country'].mode()[0],
}
for column_name, replacement in imputation_values.items():
    df[column_name] = df[column_name].fillna(replacement)

# One-hot encode Country, omitting the indicator of the first category level.
df_encoded = pd.get_dummies(df, columns=['Country'], drop_first=True)
print("\nOne-hot encoded dataset:\n", df_encoded)

# Rescale the numeric columns with MinMaxScaler (to [0, 1] with its defaults).
scaler = MinMaxScaler()
numeric_columns = ['Age', 'Salary']
df_encoded[numeric_columns] = scaler.fit_transform(df_encoded[numeric_columns])
print("\nDataset after feature scaling:\n", df_encoded)


One-hot encoded dataset:
      Age   Salary  Country_Mexico  Country_USA
0  25.00  50000.0           False         True
1  30.00  60000.0           False        False
2  36.25  70000.0            True        False
3  40.00  65000.0           False         True
4  50.00  90000.0           False         True

Dataset after feature scaling:
     Age  Salary  Country_Mexico  Country_USA
0  0.00   0.000           False         True
1  0.20   0.250           False        False
2  0.45   0.500            True        False
3  0.60   0.375           False         True
4  1.00   1.000           False         True


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression that predicts Salary from the remaining columns of
# the encoded frame, then report its coefficients and hold-out metrics.
X = df_encoded.drop('Salary', axis=1)
y = df_encoded['Salary']

# NOTE(review): Age and Salary in df_encoded were scaled in an earlier cell,
# i.e. before this split, so the held-out rows contributed to the scaling
# statistics. Confirm whether that leakage is acceptable for this exercise.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

# R-squared is only defined for two or more samples. On a very small frame
# test_size=0.2 can leave a single test row, for which r2_score warns and
# returns NaN. Guard the call so the undefined case is explicit, not silent.
r2_is_defined = len(y_test) >= 2
r2 = r2_score(y_test, y_pred) if r2_is_defined else float('nan')

print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
if not r2_is_defined:
    print(
        "Note: R-squared is undefined because the test split holds only",
        len(y_test),
        "sample(s); at least 2 are required.",
    )

Coefficients: [ 0.97039474  0.06126645 -0.06126645]
Intercept: 0.002055921052631471
Mean Squared Error: 0.0029014524000173223
R-squared: nan


