In [11]:
# Install the required libraries (uncomment the next line if they are missing)
# %pip install pandas numpy scikit-learn


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



# Read the student exam scores from "student_exam_scores.csv" (path is relative
# to the notebook's working directory).
df = pd.read_csv("student_exam_scores.csv")

# Preview the first rows of the raw table.
preview = df.head()
print(preview)


# Count the missing entries in each column.
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)


# Subject columns that are combined into the total / average score.
SUBJECT_COLS = ['Maths', 'Science', 'English']

# Create a new feature: total score across the subjects.
df['Total_Score'] = df[SUBJECT_COLS].sum(axis=1)


# Create a new feature: average score across the subjects.
df['Average_Score'] = df['Total_Score'] / len(SUBJECT_COLS)
print("\nUpdated Dataset:\n", df.head())

print(df.columns)

# Define features and target.
if 'FinalGrade' in df.columns:
    # A real grade column exists: predict it from the raw subject scores.
    # Average_Score is deliberately left out -- it is an exact linear
    # combination of the subject columns and carries no extra information.
    feature_cols = SUBJECT_COLS
else:
    # No grade column: fall back to the average score as the target.
    # The subject scores (and Average_Score itself) determine that target
    # exactly, so using them as features would leak the answer and produce a
    # meaningless perfect fit. Predict from the study-habit columns instead.
    df['FinalGrade'] = df['Average_Score']
    feature_cols = ['hours_studied', 'sleep_hours']

X = df[feature_cols]
y = df['FinalGrade']


# Split into training (80%) and testing (20%) sets; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Label spelled "Training" (was "Traing") so it matches the other cell's report.
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


# Train a linear regression model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

# Evaluate the predictions: root-mean-squared error and R^2 on the test split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
# ".2f" = two decimal places. The previous spec "2f" meant a minimum field
# width of 2 with the default six decimals, which is not what was intended.
print(f"R^2 Score: {r2:.2f}")

from sklearn.ensemble import RandomForestRegressor


# Random forest regressor (fixed seed) and the hyper-parameter grid to search:
# number of trees and maximum tree depth.
model_rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}

# Grid search with 5-fold cross-validation, scoring candidates by R^2.
grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter combination found by the search.
print("\nBest Parameters:", grid_search.best_params_)

# Evaluate the best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
optimized_r2 = r2_score(y_test, y_pred_best)
print(f"Optimized R^2 Score: {optimized_r2:.2f}")




  student_id  hours_studied  sleep_hours  Maths  Science  English
0       S001            8.0          8.8   72.1       45     30.2
1       S002            1.3          8.6   60.7       55     25.0
2       S003            4.0          8.2   73.7       86     35.8
3       S004            3.5          4.8   95.1       66     34.0
4       S005            9.1          6.4   89.8       71     40.3

Missing Values:
 student_id       0
hours_studied    0
sleep_hours      0
Maths            0
Science          0
English          0
dtype: int64

Updated Dataset:
   student_id  hours_studied  sleep_hours  Maths  Science  English  \
0       S001            8.0          8.8   72.1       45     30.2   
1       S002            1.3          8.6   60.7       55     25.0   
2       S003            4.0          8.2   73.7       86     35.8   
3       S004            3.5          4.8   95.1       66     34.0   
4       S005            9.1          6.4   89.8       71     40.3   

   Total_Score  Average_S

In [16]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score


# Load the transaction data from "fraud_detection.csv" (path is relative to
# the notebook's working directory).
df = pd.read_csv("fraud_detection.csv")

# Preview the first rows of the raw table.
print(df.head())

# Count the missing entries in each column.
# Label spelled "Missing Values" (was "MIssin Values") to match the first cell.
print("\nMissing Values:\n", df.isnull().sum())


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


# Columns the model needs: two features plus the label.
MODEL_COLS = ['amount', 'type', 'isFraud']

# Drop rows that are missing any modelling column BEFORE encoding or splitting.
# The earlier order was wrong in three ways:
#   * 'type' was label-encoded while it still contained NaN, so a missing type
#     became a category of its own (older scikit-learn versions may instead
#     fail on the mixed str/NaN column), and the later fillna on 'type' could
#     never take effect;
#   * missing 'amount' / 'isFraud' were filled with 0, which fabricates
#     zero-amount "not fraud" transactions for the model to learn from;
#   * dropna ran after X, y and the train/test split had already been built,
#     so it had no effect on the data used for training.
# .copy() gives an independent frame so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=MODEL_COLS).copy()

# Encode the categorical transaction type as integer codes.
encoder = LabelEncoder()
df['type'] = encoder.fit_transform(df['type'])

print("\nEncoded Data:\n", df.head())


# Define features (X) and target (y).
X = df[['amount', 'type']]
y = df['isFraud']

# Split into training (80%) and testing (20%) sets; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


# Train a decision tree classifier on the training split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

from sklearn.metrics import precision_score, recall_score

# Predict the fraud label for every held-out transaction.
y_pred = model.predict(X_test)


# Precision and recall of the predictions against the true test labels.
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall)):
    print(f"{metric_name}: {metric_value:.2f}")

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0   1.0   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1   1.0   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2   1.0  TRANSFER    181.00  C1305486145          181.0            0.00   
3   1.0  CASH_OUT    181.00   C840083671          181.0            0.00   
4   1.0   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0      0.0             0.0  
1  M2044282225             0.0             0.0      0.0             0.0  
2   C553264065             0.0             0.0      1.0             0.0  
3    C38997010         21182.0             0.0      1.0             0.0  
4  M1230701703             0.0             0.0      0.0             0.0  

MIssin Values:
 step              104106
type              104106
amount            104106
nameOrig     