### Decorator

In [1]:
# let's create a decorator 
def my_decorator(func):
    def wrapper():
        print("Before calling the function") 
        func()
        print("After calling the function")
    return wrapper
    

In [3]:
@my_decorator
def say_hello(): 
    print("Hello")

In [4]:
say_hello()

Before calling the function
Hello
After calling the function


### Reading file with context manager 

In [7]:
# reading the entire file

with open("test.txt", "r") as f: 
    content = f.read()
print(content)

Hey hellow rushikesh 
today is the friday: and it is good 


In [8]:
#reading line by line 

with open("test.txt", "r") as f: 
    for line in f: 
        print(line)

Hey hellow rushikesh 

today is the friday: and it is good 


### Reading the lines to list 

In [10]:
with open("test.txt", "r") as f:
    lines = f.readlines()  # returns a list of lines
    print(lines)


['Hey hellow rushikesh \n', 'today is the friday: and it is good ']


### let's create a dataframe 

In [12]:
# importing the libarraies 
import pandas as pd 
import numpy as np 

np.random.seed(42)

In [31]:
n_rows = 10000
data = {
    "id": np.arange(1, n_rows + 1),
    "name": [f"User_{i}" for i in range(1, n_rows + 1)],
    "age": np.random.randint(18, 60, size=n_rows),
    "salary": np.random.randint(30000, 150000, size=n_rows),
    "department": np.random.choice(["IT", "HR", "Finance", "Marketing", "Sales"], size=n_rows),
    "join_date": pd.date_range(start="2010-01-01", periods=n_rows, freq="D")
}

In [32]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,name,age,salary,department,join_date
0,1,User_1,32,37837,IT,2010-01-01
1,2,User_2,38,53108,IT,2010-01-02
2,3,User_3,32,147736,HR,2010-01-03
3,4,User_4,40,47851,Finance,2010-01-04
4,5,User_5,19,56600,Sales,2010-01-05


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          10000 non-null  int64         
 1   name        10000 non-null  object        
 2   age         10000 non-null  int64         
 3   salary      10000 non-null  int64         
 4   department  10000 non-null  object        
 5   join_date   10000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 468.9+ KB


In [16]:
df.to_csv("sample_data.csv", header=False)

In [17]:
import pandas as pd 

def read_in_chunks(file_path, chunksize=100): 
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        yield chunk

In [22]:
gen= read_in_chunks(file_path = "sample_data.csv", 
               chunksize=100)

In [24]:
for i in gen: 
    print(i.info())
    break

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 100 to 199
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   0           100 non-null    int64 
 1   1           100 non-null    int64 
 2   User_1      100 non-null    object
 3   56          100 non-null    int64 
 4   70794       100 non-null    int64 
 5   Sales       100 non-null    object
 6   2010-01-01  100 non-null    object
dtypes: int64(4), object(3)
memory usage: 5.6+ KB
None


In [33]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score

In [34]:
df["experience"] = (pd.to_datetime("2025-01-01") - df["join_date"]).dt.days // 365


In [35]:
X = df[["age", "department", "experience"]]
y = df["salary"]

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [37]:
# Build a preprocessing + model pipeline 

In [38]:
numeric_features = ["age", "experience"]
categorial_features = ["department"]

In [40]:
preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), numeric_features), 
        ("cat", OneHotEncoder(), categorial_features)
    ]
)

model = Pipeline(
    steps = [
        ("preprocessor", preprocessor), 
        ("regressor", LinearRegression())
    ]
)

In [41]:
model

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
# let's train a model 

model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [43]:
y_pred = model.predict(X_test)

print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 1239435334.4151278
R² Score: -0.003157495378351749


In [44]:
new_user = pd.DataFrame({
    "age": [30],
    "department": ["IT"],
    "experience": [5]
})

pred_salary = model.predict(new_user)
print("Predicted Salary:", pred_salary[0])


Predicted Salary: 90106.19963766675


In [47]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [49]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(), categorial_features)
    ]
)

# --- Step 3: Define models ---
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

if False:
    models["XGBoost"] = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)

# --- Step 4: Train & evaluate ---
results = {}

for name, regressor in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = {"MSE": mse, "R2": r2}
    print(f"{name} -> MSE: {mse:.2f}, R2: {r2:.4f}")

Linear Regression -> MSE: 1239435334.42, R2: -0.0032
Random Forest -> MSE: 1624637250.08, R2: -0.3149
Gradient Boosting -> MSE: 1238210404.73, R2: -0.0022
