**IMPORTING MODULES**




In [None]:
#add all import files here
# Import necessary modules for data analysis and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# modelling calls further down — consider narrowing the filter.
warnings.filterwarnings('ignore')


# Use seaborn's "whitegrid" style for all subsequent figures
sns.set(style="whitegrid")

# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline




**READING CSV FILE**




In [None]:
# Reading the CSV (keep this cell limited to the download itself — don't add anything here)

# Location of the raw salary dataset
url = "https://raw.githubusercontent.com/sahyam2023/employee-salary/main/Salary_Data.csv"

# Parse the remote CSV straight into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=url)


**CHECKING IF CSV IS PROPERLY IMPORTED BY RUNNING HEAD**




In [None]:
# Show the first five rows to confirm the file was parsed as expected
df.head(n=5)

# **DATA CLEANING STARTS FROM HERE**




In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# Handle missing data by discarding every row that has at least one missing value
df = df.dropna(axis=0, how='any')

# Alternative (not applied): fill a numeric column with its mean instead of dropping rows
# df['column_name'] = df['column_name'].fillna(df['column_name'].mean())



In [None]:
# Look for typos and inconsistent spellings in the categorical columns;
# unique() lists each distinct value once, without repeats
for column in ('Gender', 'Education Level', 'Job Title'):
    print(f"Unique values in '{column}':", df[column].unique())


In [None]:
# Standardize formats: lower-case the text columns so values that differ only
# by letter case collapse into a single category
for text_column in ('Gender', 'Education Level', 'Job Title'):
    df[text_column] = df[text_column].str.lower()


In [None]:
# Print the first rows of the cleaned dataset
cleaned_preview = df.head()
print(cleaned_preview)


# **EDA STARTS FROM HERE (EXPLORATORY DATA ANALYSIS)**

In [None]:
# Print the descriptive statistics produced by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)


**Explore the distribution of `Age`**


In [None]:
# Histogram of Age (20 bins) with a kernel-density curve drawn on top
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Age'], bins=20, kde=True, color='orange', ax=ax)
ax.set_title('Distribution of Age')
plt.show()



**Impact of `Education Level` on `Salary`**

In [None]:
# One salary box per education level
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Education Level', y='Salary', palette='viridis', ax=ax)
ax.set_title('Impact of Education Level on Salary')
# Tilt the category labels
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


**Impact of `Job title` on `salary`**

In [None]:
# The ten job titles that occur most often in the data
title_counts = df['Job Title'].value_counts()
top_job_titles = title_counts.nlargest(10).index

# Restrict the data to rows whose title is one of those ten
df_top_jobs = df.loc[df['Job Title'].isin(top_job_titles)]

# Salary distribution per job title, for the ten most frequent titles
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df_top_jobs, x='Job Title', y='Salary', palette='mako', ax=ax)
ax.set_title('Impact of Job Title on Salary (Top 10)')
# Rotate the labels around their right-hand end so each stays under its own box
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
fig.tight_layout()
plt.show()



**Impact of `Gender` on `Salary`**

In [None]:
# One salary box per gender value
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Gender', y='Salary', palette='pastel', ax=ax)
ax.set_title('Impact of Gender on Salary')
plt.show()


**Average `Salary` for Each `Job Title`:**

In [None]:
# value_counts() sorts by frequency, so the first ten index labels are the
# ten most repeated job titles
title_frequency = df['Job Title'].value_counts()
top_job_titles = title_frequency.index[:10]

df_top_jobs = df.loc[df['Job Title'].isin(top_job_titles)]

# Bar chart of Salary per job title for those ten titles
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=df_top_jobs, x='Job Title', y='Salary', palette='viridis', ax=ax)
ax.set_title('Average Salary for Top 10 Most Repeated Job Titles')
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
plt.show()


**`Salary Variation` with `Experience`**

In [None]:
# Line plot of Salary against Years of Experience, without the confidence band.
# NOTE(review): newer seaborn releases spell this option errorbar=None —
# confirm against the installed version before changing it.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df, x='Years of Experience', y='Salary', ci=None, ax=ax)
ax.set_title('Salary Variation with Experience')
ax.set_xlabel('Years of Experience')
ax.set_ylabel('Salary')
plt.show()


**Highest-Paying `Job Titles`**

In [None]:
# Rank job titles by mean salary; nlargest() returns them highest first
top_jobs = df.groupby('Job Title')['Salary'].mean().nlargest(10).index
plt.figure(figsize=(14, 6))
# order= makes the bars follow the ranking (highest-paying title first) instead
# of the order in which the titles happen to appear in the data
sns.barplot(
    x='Job Title',
    y='Salary',
    data=df[df['Job Title'].isin(top_jobs)],
    palette='viridis',
    order=list(top_jobs),
)
plt.title('Top 10 Highest-Paying Job Titles')
plt.xticks(rotation=45, ha='right')
plt.show()


**Lowest-Paying `Job Titles`**

In [None]:
# Rank job titles by mean salary; nsmallest() returns them lowest first
bottom_jobs = df.groupby('Job Title')['Salary'].mean().nsmallest(10).index
plt.figure(figsize=(14, 6))
# order= makes the bars follow the ranking (lowest-paying title first) instead
# of the order in which the titles happen to appear in the data
sns.barplot(
    x='Job Title',
    y='Salary',
    data=df[df['Job Title'].isin(bottom_jobs)],
    palette='viridis',
    order=list(bottom_jobs),
)
plt.title('Top 10 Lowest-Paying Job Titles')
plt.xticks(rotation=45, ha='right')
plt.show()


# **TRAINING AND TESTING PARTS START FROM HERE**

In [None]:
# Regression inputs: two numeric columns plus three categorical columns
feature_columns = ['Years of Experience', 'Age', 'Education Level', 'Job Title', 'Gender']
X_reg = df[feature_columns]

# Regression target
y_reg = df['Salary']


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

**ONE-HOT ENCODING HERE**

In [None]:
# One-hot encode categorical variables (Job Title, Gender, Education Level)
categorical_columns = ['Job Title', 'Gender', 'Education Level']

# Training set: drop the first level of every categorical so it acts as the baseline
X_train_encoded = pd.get_dummies(X_train_reg, columns=categorical_columns, drop_first=True)

# Test set: do NOT use drop_first here. The "first" level is chosen from the
# values present in the frame being encoded, so if the test split lacks the
# training baseline a different level would be dropped, and its rows would end
# up encoded as the training baseline. Instead, create every dummy column and
# keep exactly the training columns: levels missing from the test split become
# all-zero columns, and levels never seen in training are discarded.
X_test_encoded = pd.get_dummies(X_test_reg, columns=categorical_columns)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [None]:
# Give the test matrix exactly the training columns: join='left' keeps the
# training frame's column labels, and columns the test frame lacks are
# created and filled with 0
aligned_train, aligned_test = X_train_encoded.align(
    X_test_encoded, join='left', axis=1, fill_value=0
)
X_train_encoded, X_test_encoded = aligned_train, aligned_test

**Initializing and training the Random Forest, Linear Regression, KNN, and Decision Tree models**


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a random forest on the encoded training data (fixed seed for repeatability)
rf_model = RandomForestRegressor(random_state=42).fit(X_train_encoded, y_train_reg)

# Predict on the same rows the model was trained on; these are in-sample scores
y_train_pred = rf_model.predict(X_train_encoded)
mse_train, r2_train = (
    mean_squared_error(y_train_reg, y_train_pred),
    r2_score(y_train_reg, y_train_pred),
)

for metric_label, metric_value in (
    ("Training Mean Squared Error:", mse_train),
    ("Training R-squared:", r2_train),
):
    print(metric_label, metric_value)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit ordinary linear regression on the encoded training data
linear_reg = LinearRegression().fit(X_train_encoded, y_train_reg)

# Unlike the neighbouring cells, this one is scored on the held-out test set
y_pred = linear_reg.predict(X_test_encoded)
mse, mae, r2 = (
    score(y_test_reg, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the three test-set metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a k-nearest-neighbours regressor with the library's default settings
knn_model = KNeighborsRegressor().fit(X_train_encoded, y_train_reg)

# In-sample predictions and scores
y_train_pred = knn_model.predict(X_train_encoded)
mse_train, r2_train = (
    mean_squared_error(y_train_reg, y_train_pred),
    r2_score(y_train_reg, y_train_pred),
)

for metric_label, metric_value in (
    ("Training Mean Squared Error:", mse_train),
    ("Training R-squared:", r2_train),
):
    print(metric_label, metric_value)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a single decision tree (fixed seed for repeatability)
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train_encoded, y_train_reg)

# In-sample predictions and scores
y_train_pred = tree_model.predict(X_train_encoded)
mse_train, r2_train = (
    mean_squared_error(y_train_reg, y_train_pred),
    r2_score(y_train_reg, y_train_pred),
)

for metric_label, metric_value in (
    ("Training Mean Squared Error:", mse_train),
    ("Training R-squared:", r2_train),
):
    print(metric_label, metric_value)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Re-create and fit the three models with the same settings as the cells above,
# in the same order: KNN, random forest, decision tree
knn_model = KNeighborsRegressor().fit(X_train_encoded, y_train_reg)
rf_model = RandomForestRegressor(random_state=42).fit(X_train_encoded, y_train_reg)
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train_encoded, y_train_reg)

# Helper that prints the MSE and R-squared of one model on one dataset
def print_metrics(model, X, y, data_type):
    """Print the mean squared error and R-squared of ``model`` on ``(X, y)``.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values compared against the predictions.
        data_type: Label placed in front of each printed metric
            (for example "Training" or "Testing").

    Returns:
        None. The two metrics are written to standard output.
    """
    predictions = model.predict(X)
    # Compute both scores before printing anything
    scores = {
        "Mean Squared Error": mean_squared_error(y, predictions),
        "R-squared": r2_score(y, predictions),
    }
    for metric_label, metric_value in scores.items():
        print(f"{data_type} {metric_label}:", metric_value)

# Report every model on the training data first, then on the test data
fitted_models = [
    ("K-Nearest Neighbors", knn_model),
    ("Random Forest", rf_model),
    ("Decision Tree", tree_model),
]
evaluation_sets = [
    ("Training", X_train_encoded, y_train_reg),
    ("Testing", X_test_encoded, y_test_reg),
]

for set_position, (split_name, features, target) in enumerate(evaluation_sets):
    header = f"{split_name} Metrics:"
    # A blank line separates the second section from the first
    print(header if set_position == 0 else "\n" + header)
    for model_position, (model_label, fitted) in enumerate(fitted_models):
        # A blank line separates consecutive models inside a section
        print(f"{model_label}:" if model_position == 0 else f"\n{model_label}:")
        print_metrics(fitted, features, target, split_name)


In [None]:
from sklearn.metrics import mean_squared_error

# Test-set predictions of the four fitted models
y_pred_linear, y_pred_knn, y_pred_rf, y_pred_tree = (
    fitted.predict(X_test_encoded)
    for fitted in (linear_reg, knn_model, rf_model, tree_model)
)

# Test-set mean squared error of each model
mse_linear, mse_knn, mse_rf, mse_tree = (
    mean_squared_error(y_test_reg, predicted)
    for predicted in (y_pred_linear, y_pred_knn, y_pred_rf, y_pred_tree)
)


In [None]:
from sklearn.metrics import r2_score

# Test-set predictions of the four fitted models
test_set_predictions = [
    fitted.predict(X_test_encoded)
    for fitted in (linear_reg, knn_model, rf_model, tree_model)
]
y_pred_linear, y_pred_knn, y_pred_rf, y_pred_tree = test_set_predictions

# Test-set R-squared of each model
r2_linear, r2_knn, r2_rf, r2_tree = [
    r2_score(y_test_reg, predicted) for predicted in test_set_predictions
]


In [None]:
import matplotlib.pyplot as plt

# Model names and the test-set scores computed in the cells above
models = ['Linear Regression', 'KNN', 'Random Forest', 'Decision Tree']
mse_scores = [mse_linear, mse_knn, mse_rf, mse_tree]
r2_scores = [r2_linear, r2_knn, r2_rf, r2_tree]

# One bar chart per metric: (scores, metric name, bar colour)
chart_specs = (
    (mse_scores, 'Mean Squared Error', 'royalblue'),
    (r2_scores, 'R-squared', 'forestgreen'),
)
for scores, metric_name, bar_color in chart_specs:
    plt.figure(figsize=(10, 6))
    plt.bar(models, scores, color=bar_color)
    plt.title(f'{metric_name} for Different Regression Models')
    plt.xlabel('Model')
    plt.ylabel(metric_name)
    plt.xticks(rotation=15)
    plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Test-set predictions of every fitted model, keyed by display name
test_predictions = {
    'Linear Regression': linear_reg.predict(X_test_encoded),
    'KNN': knn_model.predict(X_test_encoded),
    'Random Forest': rf_model.predict(X_test_encoded),
    'Decision Tree': tree_model.predict(X_test_encoded),
}
y_pred_linear, y_pred_knn, y_pred_rf, y_pred_tree = test_predictions.values()

# (MSE, R2) pair per model
model_metrics = {
    model_name: (
        mean_squared_error(y_test_reg, predicted),
        r2_score(y_test_reg, predicted),
    )
    for model_name, predicted in test_predictions.items()
}
(mse_linear, r2_linear), (mse_knn, r2_knn), (mse_rf, r2_rf), (mse_tree, r2_tree) = model_metrics.values()

# Best model: lowest MSE; ties are broken by the higher R2 (hence the negation)
best_model, (best_mse, best_r2) = min(
    model_metrics.items(),
    key=lambda entry: (entry[1][0], -entry[1][1]),
)

print(f"The best model for predicting salary is: {best_model}")
print(f"Mean Squared Error: {best_mse}")
print(f"R-squared (R2): {best_r2}")


**Predict on the test set for all models**


In [None]:
# Test-set predictions of the random forest, KNN and decision tree models
y_pred_rf, y_pred_knn, y_pred_tree = (
    fitted.predict(X_test_encoded) for fitted in (rf_model, knn_model, tree_model)
)


**Evaluate the models**


In [None]:
# Test-set mean squared error of the decision tree
mse_tree = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_tree)
print(f'Mean Squared Error (Decision Tree): {mse_tree}')

In [None]:
# Test-set mean squared error of the KNN model
mse_knn = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_knn)
print(f'Mean Squared Error (KNN): {mse_knn}')

**Model Selection: SELECTING RANDOM FOREST AS IT GOT THE LOWEST MEAN SQUARED ERROR**

In [None]:
# Test-set mean squared error of the random forest
mse_rf = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_rf)
print(f'Mean Squared Error (Random Forest): {mse_rf}')

In [None]:
# Column labels of the DataFrame. This is every column of `df`, so the target
# 'Salary' is listed alongside the input features.
feature_names = df.columns

# Print the heading followed by one column name per line
print("Feature Names:", *feature_names, sep="\n")


In [None]:
# Print the encoded training matrix in full. The display limits are lifted only
# inside the `with` block, so the notebook-wide pandas display settings are
# restored afterwards and later cells keep their normal, truncated output.
# NOTE(review): this prints every row and column; X_train_encoded.head() would
# be a much shorter preview if the full dump is not needed.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(X_train_encoded)


**Accuracy Check**

In [None]:
# Put the actual test salaries next to the random forest predictions
results_columns = {
    'Actual Salary': y_test_reg,
    'Predicted Salary (Random Forest)': y_pred_rf,
}
df_results = pd.DataFrame(results_columns)

# First ten rows for a quick comparison
print(df_results.head(10))


In [None]:
# Test-set mean squared error of the random forest predictions
mse = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_rf)
print(f'Mean Squared Error: {mse}')

In [None]:
# Test-set mean absolute error of the random forest predictions
# (mean_absolute_error is imported in the linear regression cell)
mae = mean_absolute_error(y_true=y_test_reg, y_pred=y_pred_rf)
print(f'Mean Absolute Error: {mae}')

In [None]:
# Test-set R-squared of the random forest predictions
r2 = r2_score(y_true=y_test_reg, y_pred=y_pred_rf)
print(f'R-squared (R2) Score: {r2}')

In [None]:
# Rebuild the actual-vs-predicted table and print its first ten rows
df_results = pd.DataFrame(
    {
        'Actual Salary': y_test_reg,
        'Predicted Salary (Random Forest)': y_pred_rf,
    }
)
first_ten_results = df_results.head(10)
print(first_ten_results)

**Comparison of Actual and Predicted Salaries using Random Forest**

In [None]:
# Compare predicted salaries to actual salaries
df_results = pd.DataFrame({'Actual Salary': y_test_reg, 'Predicted Salary (Random Forest)': y_pred_rf})

# Display the first 10 rows for comparison
print(df_results.head(10))

# Plot at most the first 100 test samples. The x positions are derived from the
# slice itself, so they always match its length — a hard-coded range(1, 101)
# fails with a length mismatch when the test set has fewer than 100 rows.
df_sample = df_results.head(100)
sample_numbers = list(range(1, len(df_sample) + 1))

plt.figure(figsize=(12, 6))
sns.lineplot(x=sample_numbers, y='Actual Salary', data=df_sample, label='Actual Salary')
sns.lineplot(x=sample_numbers, y='Predicted Salary (Random Forest)', data=df_sample, label='Predicted Salary (Random Forest)')
plt.title('Actual vs Predicted Salaries')
plt.xlabel('Sample')
plt.ylabel('Salary')
plt.legend()
plt.show()

**Creating and Visualizing Predictions for KNN and Decision Tree Models**

In [None]:

# Test-set predictions of the KNN and decision tree models
predictions_knn = knn_model.predict(X_test_encoded)
predictions_tree = tree_model.predict(X_test_encoded)

# Create a DataFrame for predictions with the same index as the test set
df_predictions = pd.DataFrame({
    'Predicted_Salary_KNN': predictions_knn,
    'Predicted_Salary_Tree': predictions_tree,
}, index=X_test_encoded.index)

# Attach the predictions to the original DataFrame (rows outside the test set
# get NaN). Prediction columns left over from an earlier run of this cell are
# dropped first, so re-running the cell replaces them instead of appending
# duplicate columns to `df`.
df = pd.concat(
    [df.drop(columns=df_predictions.columns, errors='ignore'), df_predictions],
    axis=1,
)

# Keep only the rows that have both an actual salary and a KNN prediction
df_plot_knn = df[['Salary', 'Predicted_Salary_KNN']].dropna()

In [None]:
# Keep only the rows that have both an actual salary and a decision tree prediction
tree_columns = ['Salary', 'Predicted_Salary_Tree']
df_plot_tree = df.loc[:, tree_columns].dropna(axis=0, how='any')

In [None]:
# Keep the first occurrence of every column label and drop later repeats
knn_first_occurrence = ~df_plot_knn.columns.duplicated()
tree_first_occurrence = ~df_plot_tree.columns.duplicated()

df_plot_knn = df_plot_knn.loc[:, knn_first_occurrence]
df_plot_tree = df_plot_tree.loc[:, tree_first_occurrence]

# VISUALIZATION OF ACTUAL VS PREDICTED SALARY/OTHER GRAPHS

**`Actual` vs. `Predicted` Values for KNN**


In [None]:
# Scatter of actual salary (x) against the KNN prediction (y)
fig, ax = plt.subplots()
ax.scatter(df_plot_knn['Salary'], df_plot_knn['Predicted_Salary_KNN'], alpha=0.5)
ax.set_title('KNN: Actual vs. Predicted Salary')
ax.set_xlabel('Actual Salary')
ax.set_ylabel('Predicted Salary (KNN)')
plt.show()


**`Actual` vs. `Predicted` Values for Decision Tree**


In [None]:
# Scatter of actual salary (x) against the decision tree prediction (y)
fig, ax = plt.subplots()
ax.scatter(df_plot_tree['Salary'], df_plot_tree['Predicted_Salary_Tree'], alpha=0.5)
ax.set_title('Decision Tree: Actual vs. Predicted Salary')
ax.set_xlabel('Actual Salary')
ax.set_ylabel('Predicted Salary (Tree)')
plt.show()

**Scatter Plot for `Random Forest Regression`**

In [None]:
# Scatter of actual test salary (x) against the random forest prediction (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_reg, y_pred_rf, alpha=0.5)
ax.set_title('Random Forest Regression: Actual vs. Predicted Values')
ax.set_xlabel('Actual Salary')
ax.set_ylabel('Predicted Salary')
plt.show()

**Bar chart for `Random Forest Feature Importance`**


In [None]:
# Importance of each encoded feature, labelled with its column name
feature_importance = pd.Series(rf_model.feature_importances_, index=X_train_encoded.columns)

# Horizontal bar chart of the ten largest importances
top_features = feature_importance.nlargest(10)
ax = top_features.plot.barh(figsize=(10, 6))
ax.set_title('Top 10 Feature Importance - Random Forest')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

**Predicting salary based on experience**

In [None]:

def predict_salary_random_forest(model, input_data, feature_names):
    """Predict a salary for the first row of ``input_data``.

    Args:
        model: Fitted estimator exposing ``predict``.
        input_data: DataFrame holding the row(s) to predict.
        feature_names: Column labels the model expects, in order.

    Returns:
        The first element of ``model.predict`` for the aligned input.
    """
    # Match the model's columns: absent columns are created with 0, columns
    # not named in feature_names are dropped
    aligned_input = input_data.reindex(columns=feature_names, fill_value=0)
    return model.predict(aligned_input)[0]

# Interactive demo: retrain a random forest and predict a salary from the
# number of years of experience typed in by the user.
if __name__ == "__main__":
    import math  # only needed here, to reject nan/inf input

    INVALID_INPUT_MESSAGE = "Invalid input. Please enter a non-negative numeric value for years of experience."

    rf_model = RandomForestRegressor(random_state=42)
    X_reg = df[['Years of Experience', 'Age', 'Education Level', 'Job Title', 'Gender']]
    y_reg = df['Salary']

    # Encode the full feature frame before splitting, so train and test share
    # one set of dummy columns
    X_reg_encoded = pd.get_dummies(X_reg, columns=['Education Level', 'Job Title', 'Gender'], drop_first=True)

    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
        X_reg_encoded, y_reg, test_size=0.2, random_state=42
    )

    # Fit the Random Forest model
    rf_model.fit(X_train_reg, y_train_reg)

    # Column labels (and order) the fitted model expects
    feature_names = X_train_reg.columns.tolist()

    print("Enter the number of years of experience to predict salary using Random Forest:")
    try:
        raw_value = input().strip()
        try:
            years_of_experience = float(raw_value)
        except ValueError:
            # Replace Python's raw float-conversion error with the notebook's own message
            raise ValueError(INVALID_INPUT_MESSAGE) from None

        # float() also accepts "nan" and "inf", which slip past a plain `< 0` check
        if not math.isfinite(years_of_experience) or years_of_experience < 0:
            raise ValueError(INVALID_INPUT_MESSAGE)

        # Preprocess the input for prediction.
        # Age defaults to the median training age: a literal 0 is not a
        # plausible age and makes the prediction misleading. The dummy columns
        # default to 0, i.e. the baseline level of each categorical; set one to
        # 1 to select that level. Any column name below that is not in
        # feature_names is silently dropped by the reindex inside
        # predict_salary_random_forest.
        input_data = pd.DataFrame({
            'Years of Experience': [years_of_experience],
            'Age': [X_train_reg['Age'].median()],
            'Education Level_high school': [0],
            'Education Level_master\'s degree': [0],
            'Education Level_phd': [0],
            'Job Title_data analyst': [0],
            'Job Title_data scientist': [0],
            'Job Title_product manager': [0],
            'Job Title_sales associate': [0],
            'Job Title_senior manager': [0],
            'Job Title_software engineer': [0],
            'Gender_male': [0],
        })

        predicted_salary_rf = predict_salary_random_forest(rf_model, input_data, feature_names)
        print(f"Predicted Salary using Random Forest: ${predicted_salary_rf:.2f}")
    except ValueError as ve:
        print(ve)
    except Exception as e:
        # Keep the notebook running, but report what went wrong
        print(f"An unexpected error occurred: {e}")
