# iNeuron Assessment

### Machine Learning

Q1. Imagine you have a dataset with different Instagram features such as Username, Caption, Hashtag, Followers, Time_Since_posted, and Likes. Your task is to predict the number of likes and the time since posted; the rest of the features are your input features. Build a model that can predict the number of likes and the time since posted.

In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# import csv file
df = pd.read_csv("instagram_reach.csv")

In [None]:
df.head()

In [None]:
# Remove the 'Unnamed: 0' and 'S.No' columns in place; they are not used
# anywhere in the analysis below.
unused_columns = ['Unnamed: 0', 'S.No']
df.drop(columns=unused_columns, inplace=True)
df.head()

In [None]:
df.head()

In [None]:
# Checking Null values
df.isnull().sum()

In [None]:
df = df.dropna()

In [None]:
# Verify that no null values remain
df.isnull().sum()

In [None]:
# Check the shape of dataset
df.shape

In [None]:
# Checking duplicate values
df.duplicated().sum()

In [None]:
# Describing numerical data columns
df.describe()

In [None]:
# checking info of the data
df.info()

In [None]:
# drawing histplot for seeing distribution of data
plt.figure(figsize=(10,8))
sns.histplot(data=df, x='Followers', bins=30, kde=True)

In [None]:
# Through the graph it is clear that the data is positively skewed

In [None]:
# we will draw pair plot to see pair wise relationship between variables
sns.pairplot(df)

In [None]:
# data contains outliers

In [None]:
# Turn "Time since posted" into a float column: remove the literal text
# 'hours' from each entry, trim surrounding whitespace, then cast.
posted_text = df['Time since posted'].str.replace('hours', '')
df['Time since posted'] = posted_text.str.strip().astype(float)

In [None]:
df.info()

In [None]:
# Replace the text columns ('USERNAME', 'Caption', 'Hashtags') with integer
# codes using scikit-learn's LabelEncoder.
# The same encoder object is refit for each column in turn, so after this cell
# `le` holds only the classes of the last column encoded ('Hashtags').
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for text_column in ('USERNAME', 'Caption', 'Hashtags'):
    df[text_column] = le.fit_transform(df[text_column])

In [None]:
df.info()

In [None]:
# Pairwise correlations between all columns of the frame
corr_matrix = df.corr()

# Draw the matrix as an annotated heatmap, two decimals per cell
heatmap_kwargs = dict(annot=True, cmap='coolwarm', fmt=".2f")
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_kwargs)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Separate the two columns to predict (targets, stored in 'y') from every
# remaining column (input features, stored in 'X').
target_columns = ['Likes', 'Time since posted']
y = df[target_columns]
X = df.drop(columns=target_columns)


In [None]:
X

In [None]:
y

In [None]:
### Do train test split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
X_train_scaled=scaler.fit_transform(X_train)

In [None]:
X_test_scaled=scaler.transform(X_test)

In [None]:
# Model building
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

In [None]:
def evaluate_model(true, predict):
    """Score predictions against the ground-truth values.

    Args:
        true: Ground-truth target values.
        predict: Predicted target values, aligned with ``true``.

    Returns:
        A 3-tuple ``(r2, mae, mse)`` holding the results of ``r2_score``,
        ``mean_absolute_error`` and ``mean_squared_error`` in that order.
    """
    metric_functions = (r2_score, mean_absolute_error, mean_squared_error)
    return tuple(metric(true, predict) for metric in metric_functions)

In [None]:
# Candidate regressors, keyed by a display name. All are used with their
# default hyper-parameters, except that the random forest gets a fixed seed
# (the same value used for train_test_split) so that its metrics are
# reproducible from one run of the notebook to the next.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
    'Randomforest': RandomForestRegressor(random_state=42),
}

In [None]:
# Show each candidate estimator (with its parameters) before training
for i, model in enumerate(models.values()):
    print(model)


# Filled with one R2 score per model by the evaluation loop that follows
r2_list = []

In [None]:
# Train every candidate model and report its metrics on the held-out test split.

# Start from an empty list so that re-running this cell does not append a
# second copy of the scores.
r2_list = []

for model in models.values():
    # Fit on the standardized features produced by `scaler`; the original
    # code computed them but then trained on the unscaled frames.
    model.fit(X_train_scaled, y_train)

    # Predict on the test features, transformed with the same scaler
    y_pred = model.predict(X_test_scaled)

    # Metrics are computed on the test split, i.e. this is validation
    R2, MAE, MSE = evaluate_model(y_test, y_pred)

    print("model test performance", model)
    print("MSE:", MSE)
    print("MAE:", MAE)
    print("R2 SCORE:", R2)

    r2_list.append(R2)

    print("=" * 40)
    print("\n")


Among these models, the Random Forest Regressor has the lowest MSE and MAE, and the highest R2 score. Therefore, based on these metrics, the Random Forest Regressor appears to be the best-performing model for this dataset. 