In [2]:
# OIBSIP
# TASK 2         : Unemployment analysis with python
# AUTHOR         : Shreyas Ghodekar
# MODEL          : Linear Regression

In [3]:
# Importing
# 1. Python Modules 
# 2. Advertising Dataset

In [4]:
import pandas as pd
import numpy as np

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
df1 = pd.read_csv('Unemployment in India.csv')

In [7]:
df2 = pd.read_csv('Unemployment_Rate_upto_11_2020.csv')

In [8]:
# Data Processing

In [9]:
df1.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area'],
      dtype='object')

In [10]:
df2.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Region.1', 'longitude', 'latitude'],
      dtype='object')

In [11]:
# Concating the datasets

In [12]:
df = pd.concat([df1,df2])

In [13]:
df

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Area,Region.1,longitude,latitude
0,Andhra Pradesh,31-05-2019,Monthly,3.65,11999139.0,43.24,Rural,,,
1,Andhra Pradesh,30-06-2019,Monthly,3.05,11755881.0,42.05,Rural,,,
2,Andhra Pradesh,31-07-2019,Monthly,3.75,12086707.0,43.50,Rural,,,
3,Andhra Pradesh,31-08-2019,Monthly,3.32,12285693.0,43.97,Rural,,,
4,Andhra Pradesh,30-09-2019,Monthly,5.17,12256762.0,44.68,Rural,,,
...,...,...,...,...,...,...,...,...,...,...
262,West Bengal,30-06-2020,M,7.29,30726310.0,40.39,,East,22.9868,87.855
263,West Bengal,31-07-2020,M,6.83,35372506.0,46.17,,East,22.9868,87.855
264,West Bengal,31-08-2020,M,14.87,33298644.0,47.48,,East,22.9868,87.855
265,West Bengal,30-09-2020,M,9.35,35707239.0,47.73,,East,22.9868,87.855


In [14]:
df.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area', 'Region.1', 'longitude', 'latitude'],
      dtype='object')

In [15]:
# Select relevant columns for analysis
selected_columns = ['Region', ' Estimated Unemployment Rate (%)', ' Estimated Employed', ' Estimated Labour Participation Rate (%)', 'Area']
df = df[selected_columns]

# Handle missing values if any
df = df.dropna()


In [16]:
# Split the dataset into input features and target variable

In [17]:
X = df.drop(' Estimated Unemployment Rate (%)', axis=1)
y = df[' Estimated Unemployment Rate (%)']

In [18]:
# Split the dataset into training and testing sets

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
print(X_train.dtypes)
print(y_train.dtypes)

Region                                       object
 Estimated Employed                         float64
 Estimated Labour Participation Rate (%)    float64
Area                                         object
dtype: object
float64


In [21]:
X_train_encoded = pd.get_dummies(X_train, columns=['Region', 'Area'])
X_test_encoded = pd.get_dummies(X_test, columns=['Region', 'Area'])
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

In [22]:
# Train the model

In [23]:
model = LinearRegression()
model.fit(X_train_encoded, y_train)

In [24]:
y_train_pred = model.predict(X_train_encoded)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

y_test_pred = model.predict(X_test_encoded)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print("Training set:")
print(f"RMSE: {train_rmse:.2f}")
print(f"R^2: {train_r2:.2f}")

print("\nTesting set:")
print(f"RMSE: {test_rmse:.2f}")
print(f"R^2: {test_r2:.2f}")

Training set:
RMSE: 8.04
R^2: 0.40

Testing set:
RMSE: 9.95
R^2: 0.32


In [25]:
# Custom Testing

In [26]:
# Prepare new data for prediction
new_data = pd.DataFrame({'Region': ['Region1'], 'Estimated Employed': [5000], 'Estimated Labour Participation Rate (%)': [70], 'Area': [500]})
new_data = new_data.reindex(columns=X_train_encoded.columns, fill_value=0)

# Predict unemployment rate
predicted_unemployment_rate = model.predict(new_data)
print(f"\nPredicted Unemployment Rate: {predicted_unemployment_rate[0]:.2f}%")



Predicted Unemployment Rate: 19.36%
