In [25]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker
fake = Faker()

# Function to generate a random name
def generate_name():
    return fake.name()

# Generate dataset
num_entries = 200000
data = {
    'Name': [generate_name() for _ in range(num_entries)],
    'Age': [random.randint(22, 65) for _ in range(num_entries)],
    'Years_of_Experience': [random.randint(0, 30) for _ in range(num_entries)],
    'Current_Salary': [random.randint(30000, 150000) for _ in range(num_entries)]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save dataset to CSV (optional)
df.to_csv('hr_dataset.csv', index=False)

print(f"Dataset with {num_entries} entries generated.")


Dataset with 200000 entries generated.


In [27]:
import pandas as pd
import numpy as np

# Load dataset (assuming 'hr_dataset.csv' is your generated dataset)
df = pd.read_csv('hr_dataset.csv')

# Example of basic data cleaning and sanitization
# Replace any missing values with the mean (for numerical columns)
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Years_of_Experience'].fillna(df['Years_of_Experience'].mean(), inplace=True)
df['Current_Salary'].fillna(df['Current_Salary'].mean(), inplace=True)

# Remove duplicates if any
df.drop_duplicates(inplace=True)

# Check for outliers and handle them as needed
# For example, clipping Current Salary at a reasonable upper limit
upper_limit = np.percentile(df['Current_Salary'], 99)  # Adjust percentile as needed
df['Current_Salary'] = np.clip(df['Current_Salary'], 30000, upper_limit)

# Save cleaned dataset to CSV (if needed)
df.to_csv('cleaned_hr_dataset.csv', index=False)

print("Data cleaning and sanitization completed.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Years_of_Experience'].fillna(df['Years_of_Experience'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

Data cleaning and sanitization completed.


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming df is your DataFrame from the generated dataset

# Separate features and target
X = df[['Age', 'Years_of_Experience', 'Current_Salary']]
y = df['Current_Salary']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load cleaned dataset
df = pd.read_csv('cleaned_hr_dataset.csv')

# Separate features and target
X = df[['Age', 'Years_of_Experience', 'Current_Salary']]
y = df['Current_Salary']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R-squared': r2}

# Print results
for name, result in results.items():
    print(f'{name} Metrics:')
    print(f'Mean Squared Error: {result["MSE"]}')
    print(f'R-squared: {result["R-squared"]}')
    print()

# Select the best model based on metrics
best_model = min(results, key=lambda x: results[x]['MSE'])

print(f'Best Model: {best_model}')


Linear Regression Metrics:
Mean Squared Error: 4.3942666405234215e-21
R-squared: 1.0

Decision Tree Regression Metrics:
Mean Squared Error: 1.05355
R-squared: 0.9999999991141959

Random Forest Regression Metrics:
Mean Squared Error: 0.3904862875000236
R-squared: 0.9999999996716868

Best Model: Linear Regression


In [39]:
import joblib

# Assuming 'best_model' is your trained Linear Regression model
best_model = models['Linear Regression']  # Replace with your actual model object

# Save the model to a specific directory
joblib.dump(best_model, r'C:\Users\Dell\Downloads\linear_regression_model.pkl')

['C:\\Users\\Dell\\Downloads\\linear_regression_model.pkl']

In [41]:
pip install streamlit



