In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Load the new dataset
data = pd.read_csv("/Employee_Salary_Dataset.csv")
print(data.head())

# Encode categorical features if necessary (Gender column in this dataset)
le = LabelEncoder()
data["Gender"] = le.fit_transform(data["Gender"])
print(data.head())

# Define features (X) and target (y)
# Include all columns except the target variable
x = data.drop("Salary", axis=1)
y = data["Salary"]

print("\nFeatures (X) head:")
print(x.head())
print("\nTarget (y) head:")
print(y.head())

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

# Decision Tree Classifier (Note: Decision Tree Classifier is for classification, not regression.
# Since the target is continuous (Salary), Linear Regression is more appropriate.
# I will keep the Decision Tree code but it's not suitable for this problem.)
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(x_train, y_train)
y_dt_pred = dt.predict(x_test)
# Accuracy score is not appropriate for regression, using R2 for comparison
accuracy_dt = r2_score(y_test, y_dt_pred)
print("\nDecision Tree R2 (using regression-like prediction):", accuracy_dt)


# Linear Regression
lrr = LinearRegression()
lrr.fit(x_train, y_train)
y_lrr_pred = lrr.predict(x_test)

mse = mean_squared_error(y_test, y_lrr_pred)
r2 = r2_score(y_test, y_lrr_pred)

print("\nLinear Regression MSE:", mse)
print("Linear Regression R2:", r2)

   ID  Experience_Years  Age  Gender  Salary
0   1                 5   28  Female  250000
1   2                 1   21    Male   50000
2   3                 3   23  Female  170000
3   4                 2   22    Male   25000
4   5                 1   17    Male   10000
   ID  Experience_Years  Age  Gender  Salary
0   1                 5   28       0  250000
1   2                 1   21       1   50000
2   3                 3   23       0  170000
3   4                 2   22       1   25000
4   5                 1   17       1   10000

Features (X) head:
   ID  Experience_Years  Age  Gender
0   1                 5   28       0
1   2                 1   21       1
2   3                 3   23       0
3   4                 2   22       1
4   5                 1   17       1

Target (y) head:
0    250000
1     50000
2    170000
3     25000
4     10000
Name: Salary, dtype: int64

Decision Tree R2 (using regression-like prediction): -1.5088174747167384

Linear Regression MSE: 6384539505667.6