In [17]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
# Suppress every warning for the rest of the session (library warnings included).
warnings.filterwarnings("ignore")

# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train_final.csv", "test_final.csv")
)

In [18]:
# Show (rows, columns) for the training split, then for the test split.
print(train_data.shape, test_data.shape, sep="\n")

(25000, 15)
(23842, 15)


In [19]:
# Show the column index of the training split, then of the test split.
print(train_data.columns, test_data.columns, sep="\n")

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income>50K'],
      dtype='object')
Index(['ID', 'age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'],
      dtype='object')


In [20]:
# List the training-set columns that contain the '?' placeholder used for
# missing values. Only train_data is scanned; test_data is not checked here.
missing_features = [
    column
    for column in train_data.columns
    # Vectorised comparison: a column qualifies as soon as one cell equals '?'.
    if train_data[column].eq('?').any()
]
print(missing_features)

['workclass', 'occupation', 'native.country']


In [22]:
# Column groups: which inputs are treated as numeric and which as categorical
numeric_features = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]
categorical_features = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Numeric columns are standardised. Categorical columns are one-hot encoded with
# the first level of each feature dropped, a dense output array, and categories
# not seen during fit ignored instead of raising.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    drop='first',
)

# Route each column group to its transformer
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [23]:
# Encode the target column as integers (the notebook assumes it is binary)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['income>50K'])

# Chain the preprocessing step and a linear regression into a single estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on every training column except the target
model.fit(train_data.drop(columns='income>50K'), y_train)

In [25]:
# Score the test split.
# The ID column is copied out first and kept apart from the model inputs.
test_ids = test_data['ID'].copy()
test_predictions = model.predict(test_data.drop(columns='ID'))
# Turn the continuous regression outputs into 0/1 with a cut-off at 0.5
binary_predictions = np.greater_equal(test_predictions, 0.5).astype(int)

In [26]:
# Assemble the submission: one row per test ID, with the thresholded prediction
# mapped back to the original label values.
submission = pd.DataFrame({
    'ID': test_ids,
    'income>50K': label_encoder.inverse_transform(binary_predictions)
})

# 5-fold cross-validation on the training data. The pipeline ends in a
# LinearRegression, so the metric is R^2 on the label-encoded target. It is
# requested explicitly here instead of relying on the estimator's default
# scorer, and it is NOT the classification accuracy of the thresholded output.
cv_scores = cross_val_score(
    model,
    train_data.drop('income>50K', axis=1),
    y_train,
    cv=5,
    scoring='r2',
)
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())


Cross-validation scores: [0.36568657 0.35916843 0.36453316 0.38204489 0.34844255]
Average CV score: 0.3639751228003168


In [27]:
# Write the submission table to a CSV file, leaving out the DataFrame index
submission.to_csv(path_or_buf='income_predictions_linear_regression.csv', index=False)