In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [40]:
# Load the Instagram reach dataset from a CSV in the current working directory.
data = pd.read_csv(filepath_or_buffer='./instagram_reach.csv')

In [41]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB


In [42]:
# Count the missing values in each column (isna is the canonical name of isnull).
data.isna().sum()

Unnamed: 0           0
S.No                 0
USERNAME             0
Caption              6
Followers            0
Hashtags             0
Time since posted    0
Likes                0
dtype: int64

In [43]:
# Show the dtype pandas inferred for each column.
data.dtypes

Unnamed: 0            int64
S.No                  int64
USERNAME             object
Caption              object
Followers             int64
Hashtags             object
Time since posted    object
Likes                 int64
dtype: object

In [44]:
# Number of rows that repeat an earlier row across every column.
data.duplicated().sum()

0

In [45]:
# Preview the first five rows of the loaded data.
data.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [46]:
# Fill the missing captions with the most frequent caption value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# in-place assignment, which emits a FutureWarning on pandas 2.x and leaves
# `data` unchanged once Copy-on-Write is active.
most_common_caption = data['Caption'].mode()[0]
data['Caption'] = data['Caption'].fillna(most_common_caption)

In [47]:
# Re-check the per-column missing counts now that Caption has been imputed.
data.isna().sum()

Unnamed: 0           0
S.No                 0
USERNAME             0
Caption              0
Followers            0
Hashtags             0
Time since posted    0
Likes                0
dtype: int64

In [49]:
# Drop the serial-number column.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'S.No' has already been removed from `data`.
data = data.drop(columns=['S.No'], errors='ignore')

In [58]:
# Keep only the first run of digits in "Time since posted"
# (e.g. "11 hours" -> 11.0) and store it as float.
# - The pattern is a raw string: '\d' inside a normal string literal is an
#   invalid escape sequence (SyntaxWarning on Python 3.12+).
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame, so a single column is assigned to a single column.
# NOTE(review): the unit text is discarded. The rows shown by head() are all in
# hours -- confirm no row uses another unit before treating the value as hours.
data['Time since posted'] = (
    data['Time since posted']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

In [59]:
# Preview the frame after dropping S.No and converting "Time since posted" to float.
data.head()

Unnamed: 0.1,Unnamed: 0,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11.0,139
1,1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2.0,23
2,2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2.0,25
3,3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3.0,49
4,4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3.0,30


In [60]:
# Candidate predictor columns. USERNAME, Caption and Hashtags are object-dtype
# text columns; Followers is the only numeric one.
feature_columns = ['USERNAME', 'Caption', 'Hashtags', 'Followers']
X = data[feature_columns]

# Two regression targets taken from the same frame.
y_likes, y_time_since_posted = data['Likes'], data['Time since posted']

In [61]:
# Hold out 20% of the rows. Splitting the features and both targets in one call
# keeps all three aligned on the same rows; random_state pins the shuffle.
(
    X_train, X_test,
    y_likes_train, y_likes_test,
    y_time_train, y_time_test,
) = train_test_split(
    X,
    y_likes,
    y_time_since_posted,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features, fitting the scaler on the training split only so no
# test-set statistics leak into training.
# StandardScaler accepts numeric input only, and X_train/X_test still carry the
# raw text columns (USERNAME, Caption, Hashtags); passing them through raises
# "could not convert string to float". Restrict scaling to the numeric columns.
# NOTE(review): the text columns therefore do not reach the models; encode them
# (e.g. hashtag count, caption length) if they should contribute.
numeric_features = X_train.select_dtypes(include='number').columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_scaled = scaler.transform(X_test[numeric_features])

In [None]:
# Linear regression for the Likes target, fitted on the scaled training features.
model_likes = LinearRegression()
model_likes.fit(X=X_train_scaled, y=y_likes_train)

In [None]:
# Predict Likes for the held-out rows.
predictions_likes = model_likes.predict(X=X_test_scaled)

In [None]:
# Mean squared error of the Likes predictions on the held-out rows.
mse_likes = mean_squared_error(
    y_true=y_likes_test,
    y_pred=predictions_likes,
)
print(f'Mean Squared Error (Likes): {mse_likes}')

In [None]:
# Linear regression for the "Time since posted" target, fitted on the same
# scaled training features as the Likes model.
model_time = LinearRegression()
model_time.fit(X=X_train_scaled, y=y_time_train)


In [None]:
# Predict "Time since posted" for the held-out rows.
predictions_time = model_time.predict(X=X_test_scaled)

In [None]:
# Mean squared error of the "Time since posted" predictions on the held-out rows.
mse_time = mean_squared_error(
    y_true=y_time_test,
    y_pred=predictions_time,
)
print(f'Mean Squared Error (Time Since Posted): {mse_time}')