In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Read the salary dataset from disk and print its contents

DATA_PATH = "datasets/salary_data.csv"  # relative to the notebook's working directory
df = pd.read_csv(DATA_PATH)
print(df)

In [None]:
# Preview the first five rows of the dataset

df.head(n=5)

In [None]:
# Display basic dataset information.
# DataFrame.info() prints its summary itself, so the call is not wrapped
# in print() and the cell has no return value to render.

df.info()

In [None]:
# Compute summary statistics of the dataset.
# The call is left as the cell's last expression so the notebook renders
# the resulting table.

df.describe()

In [None]:
# Keep only the employees with more than 2.5 years of experience

has_enough_experience = df["YearsExperience"].gt(2.5)
df.loc[has_enough_experience]

In [None]:
# Keep only the employees whose salary is greater than 100000

salary_threshold = 100_000
df.loc[df["Salary"] > salary_threshold]

In [None]:
# Report the lowest, highest and mean salary in the dataset

salaries = df["Salary"]
min_salary, max_salary, avg_salary = salaries.min(), salaries.max(), salaries.mean()

# One print call, one line per statistic
print(
    f"Minimum Salary: {min_salary}",
    f"Maximum Salary: {max_salary}",
    f"Average Salary: {avg_salary}",
    sep="\n",
)

In [None]:
# Show the employees whose salary is higher than the average salary
# (avg_salary must already be defined when this cell runs)

earns_above_average = df["Salary"].gt(avg_salary)
df.loc[earns_above_average]

In [None]:
# Measure how strongly experience and salary move together
# (Series.corr uses the Pearson coefficient unless told otherwise)

experience = df["YearsExperience"]
salary = df["Salary"]
correlation = experience.corr(salary)

print(f"Correlation between Years of Experience and Salary: {correlation:.4f}")

In [None]:
# Scatter plot of YearsExperience against Salary

fig, ax = plt.subplots()
ax.scatter(df["YearsExperience"], df["Salary"], color="blue")
ax.set(title="Experience vs Salary", xlabel="Experience", ylabel="Salary")
ax.grid(linestyle='--', alpha=0.7)  # light dashed grid to make values easier to read off
plt.show()

In [None]:
# Fit a linear regression that predicts Salary from YearsExperience

# Double brackets keep the features as a one-column DataFrame;
# the target is a plain Series.
x_data = df[["YearsExperience"]]
y_data = df["Salary"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain
model = LinearRegression().fit(x_train, y_train)

print("Intercept:", model.intercept_)
print("Coefficient:", model.coef_[0])

In [None]:
# Predict salary for a specific amount of experience

years = [[3]]

# The model is fitted on a DataFrame whose single feature column is
# "YearsExperience". Passing a bare nested list to predict() drops that
# column name and makes scikit-learn warn that X does not have valid
# feature names, so the query is wrapped in a DataFrame with the same column.
years_frame = pd.DataFrame(years, columns=["YearsExperience"])
predicted_salary = model.predict(years_frame)

print(f"Predicted salary for {years[0][0]} years of experience: {predicted_salary[0]:.2f}")

In [None]:
# Overlay the fitted regression line on the observed data points

experience = df["YearsExperience"]
fitted_salary = model.predict(df[["YearsExperience"]])  # model output for every row

fig, ax = plt.subplots()
ax.scatter(experience, df["Salary"], color="blue")
ax.plot(experience, fitted_salary, color="red")
ax.set(title="Experience vs Salary", xlabel="Experience", ylabel="Salary")
ax.grid(linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Score the model on the held-out test split (R², MAE, MSE)

y_pred = model.predict(x_test)

# Each metric takes (true values, predictions) in that order
r2, mae, mse = [
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
]

print(
    f"R²: {r2:.4f}",
    f"MAE: {mae:.2f}",
    f"MSE: {mse:.2f}",
    sep="\n",
)

In [None]:
# Predict salary for 0-10 years of experience

exp_range = [[i] for i in range(11)]

# The model is fitted on a DataFrame with a "YearsExperience" column, so the
# inputs are given the same column name before predicting; a bare nested
# list would make scikit-learn warn about missing feature names.
exp_frame = pd.DataFrame(exp_range, columns=["YearsExperience"])
pred_sal = model.predict(exp_frame)

for year, sal in zip(range(11), pred_sal):
    print(f"Predicted salary for {year} years of experience: {sal:.2f}")

In [None]:
# Predict salary for 0-10 years of experience

exp_range = [[i] for i in range(11)]

# The model is fitted on a DataFrame with a "YearsExperience" column, so the
# inputs are given the same column name before predicting; a bare nested
# list would make scikit-learn warn about missing feature names.
exp_frame = pd.DataFrame(exp_range, columns=["YearsExperience"])
pred_sal = model.predict(exp_frame)

# Every entry of exp_range is a one-element row such as [3]; unpack it so the
# message reads "3 years" rather than "[3] years".
for (year,), sal in zip(exp_range, pred_sal):
    print(f"Predicted salary for {year} years of experience: {sal:.2f}")