In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the admissions dataset into `df`, the DataFrame used by the rest of the
# notebook. 'application_date' is parsed as a datetime column and 'approved'
# is read with pandas' nullable boolean dtype.
DATA_PATH = './data/admissions.csv'

df = pd.read_csv(
    DATA_PATH,
    parse_dates=['application_date'],
    dtype={'approved': 'boolean'},
)


In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Show the (rows, columns) tuple followed by the column index, one per line.
print(df.shape, df.columns, sep='\n')


In [None]:
# Quick data-quality overview: column dtypes, missing values per column, and
# the number of fully duplicated rows.
print("Data Types:")
print(df.dtypes)

print("\nMissing values per column:")
print(df.isna().sum())

# A row counts as duplicated only when every column matches an earlier row.
# Message wording matches the later de-duplication cell.
print('\nNumber of duplicated rows:', df.duplicated().sum())


In [None]:
# Summary statistics as produced by DataFrame.describe().
summary_stats = df.describe()

print("describe")
print(summary_stats)


In [None]:
# Number of distinct values in each column.
unique_counts = df.nunique()

print('Unique Values per Column')
print(unique_counts)

In [None]:
# Histogram of applicant age in 10 bins, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df['age'], bins=10)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Horizontal box plot of income.
# matplotlib's boxplot does not skip NaN: any missing value would turn the
# quartiles into NaN and leave an empty plot. Missing incomes are only imputed
# in a later cell, so they are dropped here for plotting (df is not modified).
fig, ax = plt.subplots()
ax.boxplot(df['income'].dropna(), vert=False)
ax.set_title('Income Distribution')
ax.set_xlabel('Income')
plt.show()

In [None]:
# Histogram of income in 20 bins. Axis labels and plt.show() are included so
# the figure stands alone and matches the other distribution plots.
fig, ax = plt.subplots()
ax.hist(df['income'], bins=20)
ax.set_title('Income Distribution')
ax.set_xlabel('Income')
ax.set_ylabel('Count')
plt.show()

In [None]:
# 'region' is categorical, so count rows per category and draw one bar per
# region instead of running a numeric histogram (which bins the category
# positions and misaligns bars and tick labels).
# value_counts() ignores missing regions by default.
region_counts = df['region'].value_counts()

fig, ax = plt.subplots()
ax.bar(region_counts.index.astype(str), region_counts.values)
ax.set_title('Region Distribution')
ax.set_xlabel('Region')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of applicant age.
# Missing ages are dropped for plotting only: matplotlib's boxplot does not
# skip NaN, and imputation happens in a later cell. df is not modified.
fig, ax = plt.subplots()
ax.boxplot(df['age'].dropna())
ax.set_title('Age Distribution')
ax.set_ylabel('Age')
plt.show()


In [None]:
# Income distribution split by region; the title is set on the Axes that
# seaborn returns.
ax = sns.boxplot(data=df, x='region', y='income')
ax.set_title('Income Distribution by Region')
plt.show()

In [None]:
# Box plots of age per region, coloured by approval status.
# catplot is figure-level and returns a FacetGrid; with no row/col facets the
# grid holds a single Axes, which is where the title goes.
grid = sns.catplot(data=df, kind='box', x='region', y='age', hue='approved')
grid.ax.set_title('Age Distribution by Region and Approval Status')
plt.show()

In [None]:
# Bar chart of credit score per region, split by approval status (seaborn's
# bar kind aggregates with the mean by default).
grid = sns.catplot(data=df, kind='bar', x='region', y='credit_score', hue='approved')
grid.ax.set_title('Average Credit Score by Region and Approval Status')
plt.show()

In [None]:
# Slice the data per region and print summary statistics for each slice.
# The region labels are the four literal values this notebook filters on.
region_frames = {
    name: df[df.region == name]
    for name in ('North', 'South', 'East', 'West')
}

# Keep the individual per-region frames available under their own names.
north = region_frames['North']
south = region_frames['South']
east = region_frames['East']
west = region_frames['West']

for name, frame in region_frames.items():
    print(f'{name} Region:')
    print(frame.describe())

In [None]:
# Pairwise Pearson correlation between the numeric/boolean columns, printed
# rounded to three decimals.
num_cols = ['age', 'income', 'credit_score', 'employed', 'approved']
num_df = df.loc[:, num_cols].copy()

corr = num_df.corr(method='pearson')
print(corr.round(3))



In [None]:
# Scatter plot of income vs. credit score, coloured by approval outcome, with
# a least-squares trend line.
#
# This cell runs before missing values are imputed. np.polyfit cannot fit
# through NaN and the colour mapping cannot translate a missing 'approved'
# flag, so only rows with all three values present are plotted and fitted.
plot_df = df[['income', 'credit_score', 'approved']].dropna()

x = plot_df['income']
y = plot_df['credit_score']
point_colors = plot_df['approved'].map({True: 'green', False: 'red'})

fig, ax = plt.subplots()
ax.scatter(x, y, c=point_colors, alpha=0.75)

# Degree-1 polynomial fit: slope m and intercept b.
m, b = np.polyfit(x.astype(float), y.astype(float), 1)
ax.plot(x, m * x + b, color='blue', label='Trend Line')

ax.set_xlabel('Income')
ax.set_ylabel('Credit Score')
ax.set_title('Income vs Credit Score')
ax.legend()  # without this the 'Trend Line' label is never displayed
plt.show()



In [None]:
# Independent copy of df restricted to the modelling columns.
# NOTE(review): the cells that follow keep transforming `df`, not
# `cleaned_df` — confirm which frame is meant to carry forward.
columns_to_keep = ['age', 'income', 'credit_score', 'employed', 'region', 'approved']
cleaned_df = df.loc[:, columns_to_keep].copy()

cleaned_df.head(n=5)

In [None]:
# Fill missing values in each numeric column with that column's median.
# The median is computed before the fill, so it only reflects observed values.
for numeric_col in ('age', 'income', 'credit_score'):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)


In [None]:
# Report and remove duplicated rows. A row is a duplicate only when every
# column value matches an earlier row; the first occurrence is kept.
duplicate_count = df.duplicated().sum()
print('\nNumber of duplicated rows:', duplicate_count)

df = df.drop_duplicates()


In [None]:
#Create features for the dataframes
#df['age_squared'] = df['age'] ** 2
#df['income_log'] = np.log1p(df['income'])
#df['credit_score_bin'] = pd.cut(df['credit_score'], bins=[0, 600, 700, 800, 900], labels=[1, 2, 3, 4])
#df['is_employed'] = df['employed'].apply(lambda x: 1 if x == True else 0)
#df['application_month'] = df['application_date'].dt.month

# Binary 'high_risk' flag: 1 when BOTH the credit score is below 300 and the
# income is below 20000, otherwise 0.
# (A credit-score-only variant was tried earlier:
#  df['high_risk'] = (df['credit_score'] < 300).astype(int))
# NOTE(review): if credit scores in this data never fall below 300 the flag
# is constant — confirm the thresholds against the data.
low_credit = df['credit_score'] < 300
low_income = df['income'] < 20000
df['high_risk'] = (low_credit & low_income).astype(int)


#df['high_risk'] = (df['credit_score'] < 300).astype(bool)
#df['high_risk'] = df['credit_score'].apply(lambda x: 1 if x < 600 else 0)


In [None]:
# Preview the first five rows of df at this point in the notebook.
df.head(n=5)

In [None]:
# Feature matrix X = every column except the target and the raw date;
# target y = the 'approved' column.
target_col = 'approved'

y = df[target_col]
X = df.drop(columns=[target_col, 'application_date'])

# Author's recorded output for this cell was (300, 6) (300,) — not re-verified.
print(X.shape, y.shape)


In [None]:
# One-hot encode 'region': the column is replaced by one indicator column per
# region value (all levels kept; drop_first=True would omit one of them).
# This cell is not re-runnable on its own: once 'region' is encoded it no
# longer exists in X.
X = pd.get_dummies(data=X, columns=['region'])
X.head(n=5)


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded for reproducibility.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(X_train.shape, X_test.shape, sep='\n')


In [None]:
# 3-nearest-neighbours classifier.
# KNN compares rows by distance, so features on large scales (e.g. income)
# would dominate small-scale ones (e.g. one-hot region flags) if left raw.
# Standardising inside a pipeline puts all features on a comparable scale and
# ensures the scaler is fitted on the training data only.
# `knn` still exposes .fit / .predict exactly as before.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# The model was fitted on (X_train, y_train). Here it predicts labels for the
# held-out X_test (y_pred), which are compared with the true labels (y_test).
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(accuracy)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Precision / recall / F1 for the positive class, plus the confusion matrix.
# zero_division=0 reports 0 (instead of emitting a warning) when a metric's
# denominator is empty, e.g. the model never predicts the positive class.
# This matches how the later evaluation cells call these metrics.
precision = precision_score(y_test, y_pred, zero_division=0)
recall    = recall_score(y_test, y_pred, zero_division=0)
f1        = f1_score(y_test, y_pred, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LogisticRegression


In [None]:
# Scale features to unit variance without centring (with_mean=False).
# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split.
scaler = StandardScaler(with_mean=False).fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with a raised iteration cap.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)



In [None]:
# Predict on the scaled test split and score against the true labels.
y_pred = log_reg.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Logistic Regression Accuracy:", accuracy)


In [None]:
# Precision / recall / F1 for the logistic-regression predictions.
# zero_division=0 is passed to all three metrics (previously only precision),
# so an empty denominator consistently yields 0 instead of a warning.
precision = precision_score(y_test, y_pred, zero_division=0)
recall    = recall_score(y_test, y_pred, zero_division=0)
f1        = f1_score(y_test, y_pred, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
# Feature-selection section: start from a look at the encoded feature matrix.
X.head(n=5)

In [None]:
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif, VarianceThreshold
from sklearn.preprocessing     import MinMaxScaler # Scale features to [0, 1] range for chi-squared test

In [None]:
# Rescale every feature to the [0, 1] range; chi2 needs non-negative inputs.
# Note this rebinds the name `scaler` to a MinMaxScaler.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Information gain: mutual information between each feature and the target,
# ranked from most to least informative (seeded for reproducibility).
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_series = pd.Series(data=mi_scores, index=X.columns).sort_values(ascending=False)

print(
    "Top 3 Features by information Gain:",
    mi_series.head(3),
    "",
    "All Series information Gain:",
    mi_series,
    sep="\n",
)


In [None]:
# Chi-squared ranking on the [0, 1]-scaled features.
# chi2 returns (statistics, p-values); only the statistics are needed for
# ranking, so the p-values are discarded via `_`.
chi2_values, _ = chi2(X_scaled, y)
chi2_series = pd.Series(data=chi2_values, index=X.columns).sort_values(ascending=False)

print(
    "Top 3 Features by Chi-Squared Test:",
    chi2_series.head(3),
    "",
    "All Series Chi-Squared Test:",
    chi2_series,
    sep="\n",
)


In [None]:
# ANOVA F-statistic per feature (labelled "Fisher Score" in this notebook),
# ranked from highest to lowest. The p-values returned by f_classif are unused.
f_values, _ = f_classif(X, y)
f_series = pd.Series(data=f_values, index=X.columns).sort_values(ascending=False)

print(
    "Top 3 Features by Fisher Score:",
    f_series.head(3),
    "",
    "All Series Fisher Score:",
    f_series,
    sep="\n",
)


In [None]:
# Variance thresholding: drop features whose variance does not exceed the
# threshold. With threshold=0.0 only constant features are removed.
selector = VarianceThreshold(threshold=0.0)
X_reduced = selector.fit_transform(X)

print("Original number of features:", X.shape[1])
print(" Reduced number of features:", X_reduced.shape[1])

print()

# fit_transform() has already fitted the selector, so variances_ can be read
# directly; refitting (selector.fit(X).variances_) yields the same values.
var_series = pd.Series(selector.variances_, index=X.columns).sort_values(ascending=False)

# Headings now name the statistic actually shown (variance, not Fisher score).
print("Top 3 Features by Variance:")
print(var_series.head(3))

print()

print("All Series Variance:")
print(var_series)


In [None]:
# Sweep a few variance thresholds and report which features survive each one.
for threshold in (0.0, 0.01, 0.1):
    selector = VarianceThreshold(threshold=threshold).fit(X)
    X_reduced = selector.transform(X)

    # get_support() is a boolean mask over X's columns marking kept features.
    kept_features = X.columns[selector.get_support()]

    print(f"Threshold: {threshold:.2f} => Number of features: {X_reduced.shape[1]}")
    print(f"Kept features: {list(kept_features)}")

In [None]:
# Fit two selectors with different thresholds (0.1 and 100) and print the
# per-feature variances each one recorded.
# NOTE(review): variances_ holds the measured feature variances; the threshold
# is expected to change only which features are kept — confirm that identical
# output here is the intended demonstration.
selector0 = VarianceThreshold(threshold=0.1)
selector0.fit(X)
var0 = pd.Series(data=selector0.variances_, index=X.columns)

selector01 = VarianceThreshold(threshold=100)
selector01.fit(X)
var01 = pd.Series(data=selector01.variances_, index=X.columns)

print("Threshold = 0.1", var0.head(3), sep="\n")
print("\nThreshold = 100", var01.head(3), sep="\n")


In [None]:
# Cross-check: population variance computed by hand for every feature,
#   var = mean((value - mean(value)) ** 2)
manual_variances = {
    col: np.mean((X[col].values - np.mean(X[col].values)) ** 2)
    for col in X.columns
}

manual_var_series = pd.Series(manual_variances)

print("Manual variance calculation:")
print(manual_var_series.head(3))

In [None]:
# Absolute Pearson correlation between each feature and the target.
# Both sides are cast to float so boolean one-hot columns and the nullable
# boolean target are handled as plain 0/1 numbers. The comprehension also
# avoids rebinding the notebook-level `corr` matrix to a scalar.
y_numeric = y.astype(float)

corrs = {
    col: abs(np.corrcoef(X[col].astype(float), y_numeric)[0, 1])
    for col in X.columns
}

corr_series = pd.Series(corrs).sort_values(ascending=False)

print("Top 3 Features by Correlation with Target:")
print(corr_series.head(3))


In [None]:
# Side-by-side table of every feature-ranking score. Each Series is indexed by
# feature name, so pandas aligns them by row regardless of their sort order.
ranking_columns = {
    "Information Gain": mi_series,
    "Chi2": chi2_series,
    "Fisher": f_series,
    "Correlation": corr_series,
    "Variance": var_series,
}
results = pd.DataFrame(ranking_columns)

print("Feature Selection Results:")
print(results)


In [None]:
from sklearn.utils import resample

print(df.columns.tolist())

# Split the rows by outcome. Not-approved rows are treated as the majority
# class and approved rows as the minority.
# NOTE(review): confirm against the shapes printed below.
approved_flag = df['approved']
df_majority = df[approved_flag == False]
df_minority = df[approved_flag == True]

for class_frame in (df_majority, df_minority):
    print(class_frame.shape)

# Upsample minority class
#df_minority_upsampled = resample(df_minority, 
#                                 replace=True,     # sample with replacement
#                                 n_samples=len(df_majority),    # to match majority class
#                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
#df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
#print(df_upsampled['approved'].value_counts())

In [None]:
# Resampling: draw minority rows with replacement until there are as many as
# majority rows (seeded for reproducibility).
target_size = len(df_majority)

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=target_size,
    random_state=42,
)

df_minority_upsampled.shape

# Combine majority class with upsampled minority class
#df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
#print(df_upsampled['approved'].value_counts())

In [None]:
# Stack the majority rows and the upsampled minority rows, then show the
# resulting class counts.
class_frames = [df_majority, df_minority_upsampled]
df_balanced = pd.concat(class_frames)

balanced_counts = df_balanced['approved'].value_counts()
print(balanced_counts)

In [None]:
# Features and target for the balanced data. Besides the target and the raw
# date, 'high_risk' and the un-encoded 'region' column are excluded here.
excluded_columns = ['approved', 'application_date', 'high_risk', 'region']

y_balanced = df_balanced['approved']
X_balanced = df_balanced.drop(columns=excluded_columns)

print(X_balanced.shape, y_balanced.shape)
X_balanced.head(n=5)

In [None]:
# Train / test split for the class-balanced experiment.
#
# The split is made on the ORIGINAL (de-duplicated) rows, and only the
# training part is upsampled afterwards. Splitting an already-upsampled frame
# puts copies of the same minority row on both sides of the split, so the test
# score is measured partly on rows the model has memorised (data leakage).
# The test set therefore keeps the real class proportions.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

feature_cols = list(X_balanced.columns)  # same feature set as X_balanced

train_rows, test_rows = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['approved']
)

# Balance the training rows only, using the same recipe as the earlier
# resampling cell (minority drawn with replacement up to the majority size).
train_majority = train_rows[train_rows['approved'] == False]
train_minority = train_rows[train_rows['approved'] == True]
train_minority_upsampled = resample(
    train_minority,
    replace=True,
    n_samples=len(train_majority),
    random_state=42,
)
train_balanced = pd.concat([train_majority, train_minority_upsampled])

X_train, y_train = train_balanced[feature_cols], train_balanced['approved']
X_test, y_test = test_rows[feature_cols], test_rows['approved']
print(X_train.shape, X_test.shape)

# KNN is distance-based, so features are standardised first; the scaler is
# fitted on the training rows only because it lives inside the pipeline.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn.fit(X_train, y_train)


In [None]:
# Evaluate the KNN model on the held-out rows.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

# Precision, recall and F1 share one call signature, so print them in a loop.
for label, metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(label, metric(y_test, y_pred, zero_division=0))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



In [None]:
# False positives: rows predicted approved (True) whose true label is False.
predicted_positive = y_pred == True
actually_negative = y_test == False
indices = predicted_positive & actually_negative

false_positives = X_test[indices].copy()
print(false_positives)


In [None]:
# Attach actual / predicted labels to the full records by row index, then keep
# the false positives (actual False, predicted True). Rows outside the test
# split get missing values from the left join and are filtered out.
# Note: this rebinds `results`, previously the feature-selection table.
results = pd.DataFrame({'actual': y_test, 'predicted': y_pred}, index=X_test.index)
df_with_predictions = df_balanced.join(results, how='left')

was_rejected = df_with_predictions['actual'] == False
predicted_approved = df_with_predictions['predicted'] == True
false_positives = df_with_predictions[was_rejected & predicted_approved]
print(false_positives)

In [None]:
# Alternative false-positive listing (AI-generated snippet, tidied).
# y_test / y_pred may be pandas Series or NumPy arrays; X_test may be a
# DataFrame or a NumPy array.

# Work on array views under NEW names so y_test / y_pred keep their original
# type; earlier cells can then be re-run with the same inputs.
y_test_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)

# Positions where the actual label is 0/False and the prediction is 1/True.
fp_indices = np.where((y_test_arr == 0) & (y_pred_arr == 1))[0]

if isinstance(X_test, pd.DataFrame):
    # Positional lookup, because fp_indices are positions and not index labels.
    fp_records = X_test.iloc[fp_indices]
    fp_index = fp_records.index
else:
    # X_test is a NumPy array.
    fp_records = X_test[fp_indices]
    fp_index = None

print("False Positive Records (actual = 0, predicted = 1):")
print(fp_records)

# Label pairs for the same rows. When X_test is a DataFrame they carry its row
# index so they can be matched back to fp_records.
fp_labels = pd.DataFrame(
    {'y_test': y_test_arr[fp_indices], 'y_pred': y_pred_arr[fp_indices]},
    index=fp_index,
)
print("\nLabels for False Positives:")
print(fp_labels)