In [None]:
# Install scikit-learn for machine learning
!pip install scikit-learn

In [None]:
# Install pandas for data manipulation
!pip install pandas

#### Pandas is used for data manipulation and analysis. It provides powerful data structures like DataFrame and Series, making it easy to clean, transform, analyze, and visualize structured data efficiently.


In [19]:

import pandas as pd

#### sklearn.model_selection provides utilities for splitting datasets into training and testing sets, cross-validation, and other model selection strategies.

In [20]:
from sklearn.model_selection import train_test_split

#### sklearn.linear_model provides various linear models for regression and classification tasks. These models make predictions based on linear relationships in the data; this notebook uses LinearRegression.



In [21]:
from sklearn.linear_model import LinearRegression

### The R2 score (`r2_score` in `sklearn.metrics`) evaluates the performance of a regression model. It measures how well the predicted values match the actual values, indicating the proportion of variance in the dependent variable that is predictable from the independent variables. An R2 score of 1 means perfect prediction, 0 means the model does not explain any variability, and the score can be negative when the model fits worse than always predicting the mean.



In [22]:
# Note: this binds the entire sklearn.metrics module to the name `r2score`.
# `r2score` is therefore a module, not a function; the metric itself would be
# reached through it as r2score.r2_score(...).
import sklearn.metrics as r2score

#### The purpose of pd.read_csv is to read a CSV (Comma Separated Values) file and load its data into a pandas DataFrame, making it easy to manipulate and analyze tabular data in Python.

In [23]:
# Load the salary dataset; the relative path means 'Salary_Data.csv' must be in
# the current working directory of the kernel.
# The columns used later in this notebook are "YearsExperience" and "Salary".
df = pd.read_csv('Salary_Data.csv')
# Leaving `df` as the last expression makes the notebook display the DataFrame.
df

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [24]:
# Select "YearsExperience" as the feature (model input).
# The double brackets keep the result a one-column DataFrame (2-D) rather than a Series.
independent = df[["YearsExperience"]]
independent

Unnamed: 0,YearsExperience
0,1.1
1,1.3
2,1.5
3,2.0
4,2.2
5,2.9
6,3.0
7,3.2
8,3.2
9,3.7


In [25]:
# Select "Salary" as the target (model output).
# The double brackets return a one-column DataFrame rather than a Series, so the target is 2-D;
# this is why the predictions, coef_ and intercept_ further down carry an extra dimension.
dependent = df[["Salary"]]
dependent

Unnamed: 0,Salary
0,39343.0
1,46205.0
2,37731.0
3,43525.0
4,39891.0
5,56642.0
6,60150.0
7,54445.0
8,64445.0
9,57189.0


In [26]:
# Hold out part of the data for evaluation:
#   - test_size=0.3  -> 30% of the rows go to the test set, the remaining 70% to training
#   - random_state=0 -> fixes the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.3,
    random_state=0,
)

In [27]:
# Create the LinearRegression model and train it on the training data in one step.
# fit() returns the estimator itself, so `regressor` is bound to the fitted model
# (X_train are the input features, y_train the target values).
regressor = LinearRegression().fit(X_train, y_train)

# Use the trained model to predict the target values (salaries) for the test
# features, and show the predictions as the cell output.
y_pred = regressor.predict(X_test)
y_pred

array([[ 40817.78327049],
       [123188.08258899],
       [ 65154.46261459],
       [ 63282.41035735],
       [115699.87356004],
       [108211.66453108],
       [116635.89968866],
       [ 64218.43648597],
       [ 76386.77615802]])

In [28]:
## The coef_ attribute of the trained model holds the fitted coefficient(s), i.e. the slope of the regression line.
# Because the target was passed as a one-column DataFrame, coef_ is 2-D here, so coef_[0] is a
# one-element array holding the slope for YearsExperience rather than a bare number.
regressor.coef_[0]

array([9360.26128619])

In [29]:
## The intercept_ attribute of the trained model holds the y-intercept of the regression line.
# It is stored as an array here (the target was 2-D), so [0] pulls out the scalar value.
regressor.intercept_[0]

np.float64(26777.391341197632)

In [34]:
## Import r2_score, the function that computes the R-squared score, which indicates how well the model's predictions match the actual values.
## A score of 1.0 indicates perfect predictions, while a score of 0 indicates that the model does not explain any of the variability in the target variable.
from sklearn.metrics import r2_score


In [None]:
# Compare the held-out actual salaries with the model's predictions and
# display the resulting R-squared score as the cell output.
r2scores = r2_score(y_true=y_test, y_pred=y_pred)
r2scores

0.9740993407213511