In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

# Load cleaned dataset
df = pd.read_csv("/content/cleaned_data.csv")

# Convert last_funding_date to datetime
df['last_funding_date'] = pd.to_datetime(df['last_funding_date'])

# Encode categorical columns
for col in ['industry', 'hiring_roles']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Define features and target
X = df.drop(columns=['company_id', 'company_name', 'last_funding_date', 'is_hot_lead'])
y = df['is_hot_lead']

# Fix infinite values and missing values
X = X.replace([np.inf, -np.inf], np.nan)  # Convert inf to NaN
X = X.fillna(X.median())  # Fill NaNs with median values

# Split dataset into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions & evaluate
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1:.4f}")

# Save submission file
submission = pd.DataFrame({'company_id': df.iloc[X_test.index]['company_id'], 'is_hot_lead': y_pred})
submission.to_csv("submission.csv", index=False)
print("Submission file saved as 'submission.csv'")


  df['last_funding_date'] = pd.to_datetime(df['last_funding_date'])


F1-Score: 0.8509
Submission file saved as 'submission.csv'
