In [5]:
# Section 1: Feature Engineering & Model Tuning 
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor  # Using regressor instead of classifier
from sklearn.metrics import r2_score
# Load dataset
file_path = r"C:\Users\hp\Downloads\student-mat.csv"  # Ensure this file is in the correct path

df = pd.read_csv(file_path)
print(df.head())
# Feature Engineering: Creating a Total_Score from G1, G2, G3
df["Total_Score"] = df[["G1", "G2", "G3"]].sum(axis=1)
# Define features and target
X = df.drop(columns=["G3"])  # 'G3' is the final grade, which we predict
y = df["G3"]

# Convert categorical variables using one-hot encoding
X = pd.get_dummies(X, drop_first=True)
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model with Hyperparameter Tuning using GridSearchCV
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

rf = RandomForestRegressor(random_state=42)  # Using regressor for continuous target variable
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="r2")
grid_search.fit(X_train, y_train)



  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]


In [6]:
# Best Model
best_model = grid_search.best_estimator_

# Predictions & R2 Score
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)  # R2 score

# Output the best parameters and model performance
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Model R² Score: {r2:.4f}")

Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Model R² Score: 0.9762


In [8]:
# Section 2: Fraud Detection with Decision Trees
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
# Load dataset
df = pd.read_csv(r'C:\Users\hp\Downloads\fraud1.csv')  # Ensure the correct file path
print(df.head())
# Check for missing values
df.dropna(inplace=True)

# Convert categorical variable 'type' using Label Encoding
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Feature Engineering: Creating a log-transformed transaction amount
df['Log_Amount'] = np.log1p(df['amount'])

# Define features and target
X = df.drop(columns=['step', 'nameOrig', 'nameDest', 'isFraud'])  # Exclude non-relevant columns
y = df['isFraud']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree Model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predictions
y_pred = dt.predict(X_test)

# Model Evaluation
print(classification_report(y_test, y_pred))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  
0  M1979787155             0.0             0.0        0  
1  M2044282225             0.0             0.0        0  
2   C553264065             0.0             0.0        1  
3    C38997010         21182.0             0.0        1  
4  M1230701703             0.0             0.0        0  
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       482
           1       0.33      0.33      0.33         3

    accuracy                         