**Load data**

In [3]:
import pandas as pd
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_file.csv')

**Handle missing value**

In [7]:
print(train_df.dtypes)
print(test_df.dtypes)

Loan_ID               object
Gender                 int64
Married                int64
Dependents            object
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status           object
dtype: object
Loan_ID               object
Gender                 int64
Married                int64
Dependents            object
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object


In [14]:
train_df.fillna(train_df.select_dtypes(include='number').mean(), inplace=True)
test_df.fillna(test_df.select_dtypes(include='number').mean(), inplace=True)

In [15]:
train_df.fillna(train_df.select_dtypes(include=['float64', 'int64']).mean(), inplace=True)
test_df.fillna(test_df.select_dtypes(include=['float64', 'int64']).mean(), inplace=True)

**Encode categorical variable**

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
for col in categorical_columns:
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

**Feature Engineering**

In [19]:
train_df['Total_Income'] = train_df['ApplicantIncome'] + train_df['CoapplicantIncome']
test_df['Total_Income'] = test_df['ApplicantIncome'] + test_df['CoapplicantIncome']

**Split Training Data**

In [20]:
from sklearn.model_selection import train_test_split
X = train_df.drop(columns=['Loan_ID', 'Loan_Status'])
y = train_df['Loan_Status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

**Model Training**

In [44]:
print(X_train.head())
print(y_train.head())

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X_train['Dependents'] = encoder.fit_transform(X_train['Dependents'])

X_train['Dependents'] = X_train['Dependents'].replace('3+', 3)

X_train = X_train.apply(pd.to_numeric, errors='coerce')
y_train = pd.to_numeric(y_train, errors='coerce')

print(X_train.isnull().sum())
print(y_train.isnull().sum())


Empty DataFrame
Columns: [Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area, Total_Income]
Index: []
Series([], Name: Loan_Status, dtype: float64)
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Total_Income         0
dtype: int64
0


In [77]:

if hasattr(model, 'fit') and hasattr(model, 'predict'):
    print("Model is properly trained!")
else:
    print("Model might not be trained!")

sample = test_df[:5]  
predictions = model.predict(sample)
print(predictions)



Model is properly trained!


AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

**Make Predictions**

In [75]:
print(test_df.head())
print(test_df.columns)


    Loan_ID  Gender  Married Dependents  Education  Self_Employed  \
0  LP001015       1        1          0          0              0   
1  LP001022       1        1          1          0              0   
2  LP001031       1        1          2          0              0   
3  LP001035       1        1          2          0              0   
4  LP001051       1        0          0          1              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5720                  0       110.0             360.0   
1             3076               1500       126.0             360.0   
2             5000               1800       208.0             360.0   
3             2340               2546       100.0             360.0   
4             3276                  0        78.0             360.0   

   Credit_History  Property_Area  Total_Income  
0        1.000000              2          5720  
1        1.000000              2          4576  
2        1.

In [78]:
print(train_df.dtypes)

numeric_columns = train_df.select_dtypes(include=['float64', 'int64']).columns
train_numeric = train_df[numeric_columns]

imputer = SimpleImputer(strategy="mean")
imputer.fit(train_numeric)
train_numeric_imputed = imputer.transform(train_numeric)

categorical_columns = train_df.select_dtypes(include=['object']).columns
imputer_categorical = SimpleImputer(strategy="most_frequent")
train_categorical = train_df[categorical_columns]
train_categorical_imputed = imputer_categorical.fit_transform(train_categorical)

train_df_imputed = pd.concat([
    pd.DataFrame(train_numeric_imputed, columns=numeric_columns),
    pd.DataFrame(train_categorical_imputed, columns=categorical_columns)
], axis=1)

Loan_ID               object
Gender                 int64
Married                int64
Dependents            object
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status           object
Total_Income         float64
dtype: object


In [103]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # You can change this to "median" or "most_frequent"
X_train = imputer.fit_transform(X_train)

                           

from sklearn.ensemble import HistGradientBoostingClassifier

model = HistGradientBoostingClassifier()
model.fit(X_train, y_train)


from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")  # Fill missing values
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Show predictions
print("Predictions:", y_pred)


Model Accuracy: 0.81
Predictions: ['Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'N' 'N' 'Y' 'N' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y'
 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y'
 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N'
 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y']


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [106]:
train_df_features = train_df.drop(columns=["Loan_ID", "Loan_Status"]) 
train_df_labels = train_df["Loan_Status"] 

import pandas as pd
train_df = pd.read_csv("train_data.csv")  

print(train_df.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [109]:
print(train_df_features.columns)


print([col for col in train_df_features.columns])


Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']


In [110]:

train_df_features = train_df_features.ffill()


train_df_features = train_df_features.bfill()



In [111]:
print(train_df_features['Dependents'].unique())

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
train_df_features['Dependents'] = encoder.fit_transform(train_df_features['Dependents'])

train_df_features['Dependents'] = train_df_features['Dependents'].replace('3+', 3)
train_df_features['Dependents'] = pd.to_numeric(train_df_features['Dependents'], errors='coerce')
train_df_features = train_df_features.fillna(method="ffill")


['0' '1' '2' '3+']


  train_df_features = train_df_features.fillna(method="ffill")


In [112]:
print(train_df_features.columns)
print(train_df_features['Dependents'].unique())

print([col for col in train_df_features.columns])


Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')
[0 1 2 3]
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']


**Prepare Submission File**

In [104]:

import pandas as pd

# Example: Assuming 'data' is your original DataFrame
data['Loan_Status'] = 'N/A'  # Add the column with default value (or populate as needed)

# Save the DataFrame to a new file
data.to_csv('submission_file.csv', index=False)

print("Submission file created successfully!")


Submission file created successfully!
