In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score

# Read the labelled and unlabelled splits from disk
train = pd.read_csv("./Data/train.csv")
test = pd.read_csv("./Data/test.csv")

print(f"Shape of Training set {train.shape}")
print(f"Shape of Testining set {test.shape}")

# Stack both splits (train rows first) under a fresh 0..n-1 index so they can
# be cleaned and encoded together
df = pd.concat([train, test], axis=0, ignore_index=True)
df

Shape of Training set (2628, 9)
Shape of Testining set (1127, 8)


Unnamed: 0,UID,col_0,col_1,col_2,col_3,col_4,col_5,col_6,y
0,0,A 0,B0,C2,D1,100,E1,F2,237000.0
1,1,A1,B0,C11,D4,100,E4,F2,86193.0
2,2,A0,B0,C18,D0,0,E0,F2,169200.0
3,3,A2,B0,C11,D1,100,E1,F2,58000.0
4,4,A0,B0,C67,D1,0,E1,F2,235000.0
...,...,...,...,...,...,...,...,...,...
3750,3750,A0,B0,C4,D1,100,E1,,
3751,3751,A1,B0,C34,D1,0,E1,F2,
3752,3752,A2,B 0,C4,D1,0,E1,F2,
3753,3753,A0,B0,C2,D1,100,E1,F2,


In [18]:
# Print the frequency table of every column to expose inconsistent labels
for _, column_values in df.items():
    print(column_values.value_counts())

0       1
2507    1
2495    1
2496    1
2497    1
       ..
1256    1
1257    1
1258    1
1259    1
3754    1
Name: UID, Length: 3755, dtype: int64
A0        2160
A1         691
A2         275
A3          98
 A0         70
A 0         56
AO          50
A0          49
A 1         26
 A1         21
A1          20
 A2          9
A 2          9
A2           6
 A3          4
A 3          3
A   3        2
Name: col_0, dtype: int64
B0        3396
 B0         61
B 0         50
B0          48
BO          44
B3          16
B2           9
B1           9
B   3        1
B1           1
Name: col_1, dtype: int64
C11    1040
C2      840
C4      612
C9      289
C7      103
       ... 
C86       1
C81       1
C22       1
C48       1
C32       1
Name: col_2, Length: 93, dtype: int64
D1     3004
D4      167
D2       85
D0       80
D6       71
       ... 
D75       1
D66       1
D77       1
D38       1
D70       1
Name: col_3, Length: 78, dtype: int64
0      1923
100    1643
50      189
Name: col_4, dtype:

In [19]:
def preprocessing(df):
    """Clean, encode and impute the combined train/test frame.

    Steps, in order:
      1. Strip surrounding whitespace from col_0, col_1 and col_6 and map the
         known misspelt labels (e.g. 'A 0', 'AO') onto their canonical form.
      2. One-hot encode only the most frequent labels of col_2 (top 7), col_3
         and col_5 (top 10 each), then drop the three source columns.
      3. Label-encode col_4.
      4. Fill missing col_0 / col_1 / col_6 values with a 5-nearest-neighbour
         classifier trained on the remaining (already numeric) columns.
      5. One-hot encode col_0, col_1 and col_6.
      6. Re-attach log(y) and UID.

    Args:
        df (pd.DataFrame): Frame containing UID, col_0 ... col_6 and y.
            The input is not modified; all work happens on a copy.

    Returns:
        pd.DataFrame: Encoded feature columns followed by 'y' (natural log of
        the input target) and 'UID'.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect
    df = df.copy()
    UID = df['UID']

    ## Stripping all the spaces from the values
    noisy_cols = ['col_0', 'col_1', 'col_6']
    for col in noisy_cols:
        df[col] = df[col].str.strip()

    ## Manually changing the labels to proper form
    df['col_0'] = df['col_0'].replace({'A 0': 'A0', 'AO': 'A0', 'A 1': 'A1', 'A 2': 'A2', 'A 3': 'A3', 'A   3': 'A3'})
    df['col_1'] = df['col_1'].replace({'B 0': 'B0', 'BO': 'B0', 'B   3': 'B3'})
    df['col_6'] = df['col_6'].replace({'F 2': 'F2', 'F 0': 'F0', 'FO': 'F0', 'F 1': 'F1'})

    ## Most frequent labels of the high-cardinality columns
    ## (col_2 keeps 7 labels, col_3 and col_5 keep 10)
    top_k_per_column = {'col_2': 7, 'col_3': 10, 'col_5': 10}
    for variable, k in top_k_per_column.items():
        top_labels = list(df[variable].value_counts().sort_values(ascending=False).head(k).index)
        # One indicator column per retained label; every other label becomes all-zero
        for label in top_labels:
            df[variable + '_' + label] = np.where(df[variable] == label, 1, 0)

    ## Dropping columns after one hot
    df = df.drop(list(top_k_per_column), axis=1)

    # Doing label-encoding for ordinal values
    df['col_4'] = LabelEncoder().fit_transform(df['col_4'])

    ## Imputing Missing Values of column: col_0, col_1, col_6 using KNN
    # drop() returns a new frame, so df itself stays untouched
    df_imputed = df.drop(['y', 'UID'], axis=1)
    cols_to_impute = ['col_0', 'col_1', 'col_6']

    # Predictors are all columns except the ones being imputed; they never
    # change inside the loop, so build the matrix once
    predictors = df_imputed.drop(cols_to_impute, axis=1)

    for col in cols_to_impute:
        missing = df_imputed[col].isna()
        # Nothing to impute: skip, because predicting on an empty frame raises
        if not missing.any():
            continue

        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(predictors[~missing], df_imputed.loc[~missing, col])
        df_imputed.loc[missing, col] = knn.predict(predictors[missing])

    ## Encoding col_0, col_1, col_6
    # Dummy columns are appended after the untouched columns, in the listed order
    columns_to_encode = ['col_0', 'col_1', 'col_6']
    df_encoded_imputed = pd.get_dummies(df_imputed, columns=columns_to_encode)

    # Target goes back in on the log scale; rows are matched by index
    df_encoded_imputed['y'] = np.log(df['y'])
    df_encoded_imputed['UID'] = UID
    return df_encoded_imputed
    

In [20]:
# Replace the raw combined frame with its cleaned / encoded / imputed version.
# NOTE(review): the raw frame is overwritten here, so re-running this cell
# feeds already-processed data back into preprocessing — run once per kernel.
df = preprocessing(df)
df

Unnamed: 0,col_4,col_2_C11,col_2_C2,col_2_C4,col_2_C9,col_2_C7,col_2_C15,col_2_C18,col_3_D1,col_3_D4,...,col_0_A3,col_1_B0,col_1_B1,col_1_B2,col_1_B3,col_6_F0,col_6_F1,col_6_F2,y,UID
0,2,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,12.375815,0
1,2,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,11.364344,1
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,12.038837,2
3,2,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,10.968198,3
4,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,12.367341,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3750,2,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,,3750
3751,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,,3751
3752,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,,3752
3753,2,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,,3753


In [21]:
# Missing-value count per column of the processed frame
df.isna().sum()

col_4           0
col_2_C11       0
col_2_C2        0
col_2_C4        0
col_2_C9        0
col_2_C7        0
col_2_C15       0
col_2_C18       0
col_3_D1        0
col_3_D4        0
col_3_D2        0
col_3_D0        0
col_3_D6        0
col_3_D3        0
col_3_D12       0
col_3_D26       0
col_3_D8        0
col_3_D37       0
col_5_E1        0
col_5_E4        0
col_5_E2        0
col_5_E0        0
col_5_E6        0
col_5_E3        0
col_5_E11       0
col_5_E23       0
col_5_E33       0
col_5_E19       0
col_0_A0        0
col_0_A1        0
col_0_A2        0
col_0_A3        0
col_1_B0        0
col_1_B1        0
col_1_B2        0
col_1_B3        0
col_6_F0        0
col_6_F1        0
col_6_F2        0
y            1127
UID             0
dtype: int64

In [22]:
# The combined frame was built as [train rows, test rows], so the original
# training row count marks the boundary (no hard-coded row number needed)
n_train = len(train)
train_df = df.iloc[:n_train]
test_df = df.iloc[n_train:]
print(f"Shape of Training set {train_df.shape}")
print(f"Null values in Train {train_df.isnull().sum()}")
print(f"Shape of Testining set {test_df.shape}")
print(f"Null values in Test{test_df.isnull().sum()}")

Shape of Training set (2628, 9)
Null values in Train col_4        0
col_2_C11    0
col_2_C2     0
col_2_C4     0
col_2_C9     0
col_2_C7     0
col_2_C15    0
col_2_C18    0
col_3_D1     0
col_3_D4     0
col_3_D2     0
col_3_D0     0
col_3_D6     0
col_3_D3     0
col_3_D12    0
col_3_D26    0
col_3_D8     0
col_3_D37    0
col_5_E1     0
col_5_E4     0
col_5_E2     0
col_5_E0     0
col_5_E6     0
col_5_E3     0
col_5_E11    0
col_5_E23    0
col_5_E33    0
col_5_E19    0
col_0_A0     0
col_0_A1     0
col_0_A2     0
col_0_A3     0
col_1_B0     0
col_1_B1     0
col_1_B2     0
col_1_B3     0
col_6_F0     0
col_6_F1     0
col_6_F2     0
y            0
UID          0
dtype: int64
Shape of Testining set (1127, 8)
Null values in Testcol_4           0
col_2_C11       0
col_2_C2        0
col_2_C4        0
col_2_C9        0
col_2_C7        0
col_2_C15       0
col_2_C18       0
col_3_D1        0
col_3_D4        0
col_3_D2        0
col_3_D0        0
col_3_D6        0
col_3_D3        0
col_3_D12      

In [23]:
# Separate the target from the features (UID is an identifier, not a feature),
# then hold out 0.5% of the labelled rows for a quick sanity check
y = train_df['y']
X = train_df.drop(columns=['UID', 'y'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.005,
)

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV


# Standardize features
scaler = StandardScaler()
X_final = scaler.fit_transform(X_train)


def create_model(layers=1, neurons=32, activation='relu'):
    """Build and compile a fully connected regression network.

    Args:
        layers (int): Number of hidden Dense layers.
        neurons (int): Units in each hidden layer.
        activation (str): Activation used by every hidden layer.

    Returns:
        A compiled Sequential model with a single linear output unit,
        optimised with Adam on mean squared error.
    """
    model = Sequential()
    for _ in range(layers):
        model.add(Dense(neurons, activation=activation))
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# Create a KerasRegressor for use with GridSearchCV
model = KerasRegressor(build_fn=create_model, verbose=0)

# Define a parameter grid for hyperparameter tuning.
# 6 * 5 * 3 * 4 = 360 combinations, each fitted on 5 folds (1800 fits plus the
# final refit), so this cell is very slow to run in full.
param_grid = {
    'layers': [1, 2, 3, 4, 5, 6],         # Number of hidden layers
    'neurons': [16, 32, 64, 128, 256],     # Number of neurons in each hidden layer
    'activation': ['relu', 'tanh', 'sigmoid'],  # Activation functions
    'epochs': [50, 100, 150, 200],    # Number of training epochs
}

# Create a K-Fold cross-validator for hyperparameter tuning
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kf)

# Perform hyperparameter tuning
grid_search.fit(X_final, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Save the best model to a file
joblib.dump(best_model, 'est_model_keras_regressor.pkl')

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Print RMSE scores for each fold during cross-validation.
# The scores come from cv_results_ (true out-of-fold errors of the best
# configuration). Re-predicting with best_model would score rows it was refit
# on, and the earlier loop also referenced an undefined X_scaled.
best_idx = grid_search.best_index_
for i in range(kf.get_n_splits()):
    fold_neg_mse = grid_search.cv_results_[f'split{i}_test_score'][best_idx]
    rmse_fold = np.sqrt(-fold_neg_mse)  # scoring is negated MSE
    print(f"Fold {i+1} RMSE: {rmse_fold:.2f}")

  model = KerasRegressor(build_fn=create_model, verbose=0)


KeyboardInterrupt: 

In [None]:
# Initialize and train the models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=10.0, max_iter=10000000000, tol=1e-8),
    'Lasso': Lasso(alpha=10.0, max_iter=1000),
    'Decision Tree Regressor': DecisionTreeRegressor(max_depth= 10000, random_state=42),
    'Random Forest Regressor': RandomForestRegressor(max_depth= 10000, random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f'{model_name}.pkl')
    y_pred = model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
    r2 = r2_score(np.exp(y_test), np.exp(y_pred))
    
    results.append({
        'Model': model_name,
        'Mean Squared Error': rmse,
        'R-squared': r2
    })

# Display the results
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only and reuse it for the hold-out rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=10.0, max_iter=10000, tol=1e-8),
    'Lasso': Lasso(alpha=10.0, max_iter=10000),
    'Decision Tree Regressor': DecisionTreeRegressor(max_depth= 10000, random_state=42),
    'Random Forest Regressor': RandomForestRegressor(max_depth= 10000, random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

results = []

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    #joblib.dump(model, f'{model_name}.pkl')
    y_pred = model.predict(X_test_scaled)

    # The target is stored as log(y), so undo the log before scoring
    y_true_original = np.exp(y_test)
    y_pred_original = np.exp(y_pred)
    rmse = np.sqrt(mean_squared_error(y_true_original, y_pred_original))
    r2 = r2_score(y_true_original, y_pred_original)

    # The value is a root mean squared error, so label it as such
    results.append({
        'Model': model_name,
        'RMSE': rmse,
        'R-squared': r2
    })

# Display the results
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
"""# Create a Gradient Boosting Regressor model
gb_regressor = GradientBoostingRegressor()

# Define a grid of hyperparameters to search over
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 1000],         # Number of boosting stages to be used
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2],     # Step size shrinkage used in update
    'max_depth': [3, 4, 5, 7, 10],                # Maximum depth of the individual trees
    'min_samples_split': [2, 3, 4, 5, 7, 10],        # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 3, 4, 5, 7, 10]          # Minimum samples required at each leaf node
}

# Create a GridSearchCV object with the model and parameter grid
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5)

# Fit the GridSearchCV to your data to find the best parameters
grid_search.fit(X_train, y_train)  # Replace X_train and y_train with your training data

# Get the best hyperparameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

joblib.dump(best_model, f'gradientBoostingRegressorBest2.pkl')
y_pred = best_model.predict(X_test)
    
rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
print(f"RMSE: {rmse}")
r2 = r2_score(np.exp(y_test), np.exp(y_pred))
print(f"r2: {r2}")"""

In [None]:
# Average of the training target as stored in train_df (preprocessing writes log(y))
train_df.loc[:, 'y'].mean()

In [None]:
# Display the processed test rows; their 'y' column is NaN because the test file has no target
test_df

In [None]:
# Load the saved model
loaded_model = joblib.load('gradientBoostingRegressorBest.pkl')  # Replace with the actual file path

# Prepare your input data for prediction
input_data = test_df.drop(['UID', 'y'], axis=1)  # Replace with your actual input data

# Make predictions using the loaded model
predictions = loaded_model.predict(input_data)
submission_df = pd.DataFrame({'UID': test_df['UID'], 'y': np.exp(predictions)})

In [None]:
# Write the UID / prediction pairs without the index column
submission_df.to_csv(path_or_buf="submission2.csv", index=False)

In [None]:
# Sanity check: the submission should contain no missing values
submission_df.isna().sum()