# Classification Exercise

In [205]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [206]:
train_dataset = pd.read_csv('https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/classification/data/Assigment/aug_train.csv')

train_dataset

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,0.624,Male,No relevent experience,no_enrollment,High School,,5,,never,21,0
1,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,>4,12,0
2,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Public Sector,>4,26,0
3,0.624,Male,No relevent experience,Full time course,High School,,1,,never,30,1
4,0.920,Female,Has relevent experience,no_enrollment,Masters,STEM,>20,,>4,46,0
...,...,...,...,...,...,...,...,...,...,...,...
2095,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,17,,4,4,0
2096,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,8,Funded Startup,2,10,1
2097,0.855,Male,No relevent experience,no_enrollment,High School,,<1,,never,37,0
2098,0.920,Male,No relevent experience,no_enrollment,Graduate,STEM,>20,,1,11,0


In [207]:
test_dataset = pd.read_csv("https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/classification/data/Assigment/aug_test.csv")

test_dataset

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,0.624,,Has relevent experience,Full time course,Graduate,Other,3,Pvt Ltd,1,134,0
1,0.920,Female,No relevent experience,no_enrollment,Graduate,STEM,5,Early Stage Startup,1,34,1
2,0.767,,Has relevent experience,Full time course,Graduate,STEM,10,Pvt Ltd,2,90,0
3,0.910,Male,No relevent experience,,High School,,10,,never,42,0
4,0.624,Male,Has relevent experience,Part time course,Graduate,STEM,3,Pvt Ltd,1,198,0
...,...,...,...,...,...,...,...,...,...,...,...
95,0.698,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,Pvt Ltd,never,139,0
96,0.926,Male,No relevent experience,no_enrollment,Masters,STEM,10,,2,45,1
97,0.920,Male,Has relevent experience,no_enrollment,Masters,STEM,7,,1,22,0
98,0.939,Male,No relevent experience,Full time course,High School,,7,,1,182,0


### Data clean, imputation

1. in experience, replace >20 to 21; <1 to 1, and convert this as a numerical column.

#### Train Data

In [208]:
train_dataset["experience"].isna().sum()

10

In [209]:
train_dataset['experience'] = train_dataset['experience'].replace({"<1": 1, ">20": 21}).astype(float)

In [210]:
train_dataset["experience"].isna().sum()

10

In [211]:
train_dataset["experience"].dtype

dtype('float64')

#### Test Data

In [212]:
test_dataset["experience"].isna().sum()

0

In [213]:
test_dataset['experience'] = test_dataset['experience'].replace({"<1": 1, ">20": 21}).astype(float)

In [214]:
test_dataset["experience"].dtype

dtype('float64')

2. in last_new_job, replace >4 to 5; never to 0, and convert this as a numerical column

#### Train Data

In [215]:
train_dataset["last_new_job"].isna().sum()

52

In [216]:
train_dataset['last_new_job'] = train_dataset['last_new_job'].replace({">4": 5, "never": 0}).astype(float)

In [217]:
train_dataset["last_new_job"].isna().sum()

52

In [218]:
train_dataset["last_new_job"].dtype

dtype('float64')

#### Test Data

In [219]:
test_dataset["last_new_job"].isna().sum()

0

In [220]:
test_dataset['last_new_job'] = test_dataset['last_new_job'].replace({">4": 5, "never": 0}).astype(float)

In [221]:
test_dataset["last_new_job"].dtype

dtype('float64')

3. If the column is categorical, impute the missing value as its mode. If the column is numerical, impute the missing value as its median

#### Train Data

In [222]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_development_index  2100 non-null   float64
 1   gender                  1585 non-null   object 
 2   relevent_experience     2100 non-null   object 
 3   enrolled_university     2051 non-null   object 
 4   education_level         2049 non-null   object 
 5   major_discipline        1768 non-null   object 
 6   experience              2090 non-null   float64
 7   company_type            1415 non-null   object 
 8   last_new_job            2048 non-null   float64
 9   training_hours          2100 non-null   int64  
 10  target                  2100 non-null   int64  
dtypes: float64(3), int64(2), object(6)
memory usage: 180.6+ KB


In [223]:
train_dataset.isna().sum()

city_development_index      0
gender                    515
relevent_experience         0
enrolled_university        49
education_level            51
major_discipline          332
experience                 10
company_type              685
last_new_job               52
training_hours              0
target                      0
dtype: int64

In [224]:
train_dataset["gender"] = train_dataset["gender"].fillna(train_dataset["gender"].mode()[0])
train_dataset["enrolled_university"] = train_dataset["enrolled_university"].fillna(train_dataset["enrolled_university"].mode()[0])
train_dataset["education_level"] = train_dataset["education_level"].fillna(train_dataset["education_level"].mode()[0])
train_dataset["major_discipline"] = train_dataset["major_discipline"].fillna(train_dataset["major_discipline"].mode()[0])
train_dataset["experience"] = train_dataset["experience"].fillna(train_dataset["experience"].median())
train_dataset["company_type"] = train_dataset["company_type"].fillna(train_dataset["company_type"].mode()[0])
train_dataset["last_new_job"] = train_dataset["last_new_job"].fillna(train_dataset["last_new_job"].median())

In [225]:
train_dataset.isna().sum()

city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

#### Test Data

In [226]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_development_index  100 non-null    float64
 1   gender                  72 non-null     object 
 2   relevent_experience     100 non-null    object 
 3   enrolled_university     96 non-null     object 
 4   education_level         98 non-null     object 
 5   major_discipline        88 non-null     object 
 6   experience              100 non-null    float64
 7   company_type            64 non-null     object 
 8   last_new_job            100 non-null    float64
 9   training_hours          100 non-null    int64  
 10  target                  100 non-null    int64  
dtypes: float64(3), int64(2), object(6)
memory usage: 8.7+ KB


In [227]:
test_dataset.isna().sum()

city_development_index     0
gender                    28
relevent_experience        0
enrolled_university        4
education_level            2
major_discipline          12
experience                 0
company_type              36
last_new_job               0
training_hours             0
target                     0
dtype: int64

In [228]:
test_dataset["gender"] = test_dataset["gender"].fillna(test_dataset["gender"].mode()[0])
test_dataset["enrolled_university"] = test_dataset["enrolled_university"].fillna(test_dataset["enrolled_university"].mode()[0])
test_dataset["education_level"] = test_dataset["education_level"].fillna(test_dataset["education_level"].mode()[0])
test_dataset["major_discipline"] = test_dataset["major_discipline"].fillna(test_dataset["major_discipline"].mode()[0])
test_dataset["company_type"] = test_dataset["company_type"].fillna(test_dataset["company_type"].mode()[0])

In [229]:
test_dataset.isna().sum()

city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

### Classification

In [230]:
columns_to_encode = ["gender", "relevent_experience", "enrolled_university", "education_level", "major_discipline", "company_type"]

encoder = LabelEncoder()

for column in columns_to_encode:
    train_dataset[column] = encoder.fit_transform(train_dataset[column])
    test_dataset[column] = encoder.fit_transform(test_dataset[column])
    print(f"Spalte '{column}':")
    for original_category, encoded_value in zip(encoder.classes_, encoder.transform(encoder.classes_)):
        print(f"\t{original_category}: {encoded_value}")

Spalte 'gender':
	Female: 0
	Male: 1
	Other: 2
Spalte 'relevent_experience':
	Has relevent experience: 0
	No relevent experience: 1
Spalte 'enrolled_university':
	Full time course: 0
	Part time course: 1
	no_enrollment: 2
Spalte 'education_level':
	Graduate: 0
	High School: 1
	Masters: 2
Spalte 'major_discipline':
	Arts: 0
	Business Degree: 1
	Humanities: 2
	Other: 3
	STEM: 4
Spalte 'company_type':
	Early Stage Startup: 0
	Funded Startup: 1
	NGO: 2
	Public Sector: 3
	Pvt Ltd: 4


In [231]:
X_train = train_dataset.drop(columns=['target'])
y_train = train_dataset['target']

X_test = test_dataset.drop(columns=['target'])
y_test = test_dataset['target']

In [232]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

1. Build a classification model from the training set ( you can use any algorithms)

In [233]:
results = []
names_of_models = []

lg_parameters = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2', 'none'],  # Beachte: 'l1' benötigt solver='liblinear' oder 'saga'
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'max_iter': [100, 200, 300],
    'class_weight': [None, 'balanced']
    }

svc_parameter = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],  # Nur für 'poly' relevant
    'coef0': [0.0, 0.1, 0.5],  # Für 'poly' und 'sigmoid' relevant
    'class_weight': [None, 'balanced']
}

knn_parameter = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40, 50],
    'p': [1, 2]
}

dtc_parameter = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50],
    'min_impurity_decrease': [0.0, 0.01, 0.1]
}

rfc_parameter = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'oob_score': [True, False]
}


model_list = [('LR', LogisticRegression(random_state=1), lg_parameters),
              ('SVC', SVC(random_state=1), svc_parameter),
              ('KNN', KNeighborsClassifier(), knn_parameter),
              ('DTC', DecisionTreeClassifier(random_state=1), dtc_parameter),
              ('RFC', RandomForestClassifier(n_estimators=100, max_features=3,random_state=1), rfc_parameter)
            ]


for name, model, parameters_for_testing in model_list:
    kfold = KFold(n_splits=5)
    grid_cv = GridSearchCV(estimator=model, param_grid=parameters_for_testing, scoring='accuracy', cv=kfold)
    result = grid_cv.fit(scaled_X_train, y_train)

    print("{}: Best {} using {}".format(name, result.best_score_, result.best_params_))

600 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

LR: Best 0.7661904761904761 using {'C': 0.1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


KeyboardInterrupt: 

2. generate the confusion matrix and calculate the accuracy, precision, recall, and F1-score on training set. 

3. Applying the model in the test set and generating the prediction

4. generate the confusion matrix from the test set and calculate the accuracy, precision, recall, and F1-score

5. compare the results between the training and test set

Extra point: think about what kind of the method can increase the performance (does not need to run )