In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the social-media CSV from the Kaggle input directory into `df`
df = pd.read_csv(filepath_or_buffer='/kaggle/input/social.csv')

# Preview the first five rows (head's default) as the cell output
df.head(n=5)

In [None]:
# Drop the 'Unnamed: 0.1' column and rename 'Unnamed: 0' to 'Id'.
# The frame is reassigned instead of mutated in place, and the drop uses
# errors='ignore', so this cell is idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = (
    df
    .drop(columns='Unnamed: 0.1', errors='ignore')
    .rename(columns={'Unnamed: 0': 'Id'})
)

In [None]:
# Missing-value count and dtype for every column, shown side by side.
# A notebook cell only renders its last expression, so a bare
# `df.isnull().sum()` placed before `df.dtypes` is computed and then silently
# discarded. Combining both into one table makes both results visible.
pd.DataFrame({'missing': df.isnull().sum(), 'dtype': df.dtypes})

In [None]:
# Parse Timestamp into datetimes, then derive Day / Month / Year columns from it
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
timestamp_parts = df['Timestamp'].dt
df['Day'], df['Month'], df['Year'] = (
    timestamp_parts.day,
    timestamp_parts.month,
    timestamp_parts.year,
)

In [None]:
# Trim leading/trailing whitespace from each string column, one column at a time
for text_col in ('Text', 'Sentiment', 'User', 'Platform', 'Hashtags', 'Country'):
    df[text_col] = df[text_col].str.strip()

In [None]:
# Exploratory Data Analysis (EDA)

# Bar chart of the ten most frequent Sentiment values
sentiment_counts = df['Sentiment'].value_counts()
ax = sentiment_counts.nlargest(10).plot.bar()
ax.set_title('Top 10 Sentiments based on Text')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pie chart of how the rows are split across Platform values
platform_counts = df['Platform'].value_counts()
ax = platform_counts.plot.pie(autopct='%1.1f%%')
ax.set_title('Percentages of Platforms')
ax.legend()
plt.show()

In [None]:
# Bar chart of the ten most frequent Country values
country_counts = df['Country'].value_counts()
ax = country_counts.nlargest(10).plot.bar()
ax.set(title='Top 10 Countries', xlabel='Country', ylabel='Count')
plt.show()

In [None]:
# Bar chart of the ten most frequent Hashtags values
hashtag_counts = df['Hashtags'].value_counts()
ax = hashtag_counts.nlargest(10).plot.bar()
ax.set(title='Top 10 Hashtags', xlabel='Hashtags', ylabel='Count')
plt.show()

In [None]:
# pandas' default describe() summary of the frame, rendered as the cell output
summary_stats = df.describe()
summary_stats


In [None]:
# Print the smallest and largest value of each selected numeric column
numerical_columns = df[['Day', 'Month', 'Year', 'Likes', 'Retweets']]
for col in numerical_columns:  # iterating a DataFrame yields its column labels
    lowest, highest = df[col].min(), df[col].max()
    print(f"Minimum {col}: {lowest} | Maximum {col}: {highest}")

In [None]:
# Top 10 hashtags ranked by each hashtag's maximum Retweets value.
# H_R holds the aggregated Series (like H_R_f / H_R_t / H_R_i elsewhere in this
# notebook) rather than the Axes object returned by .plot(), so the ranking
# itself stays available for inspection after the chart is drawn.
H_R = df.groupby('Hashtags')['Retweets'].max().nlargest(10).sort_values(ascending=False)
H_R.plot(kind='bar')
plt.title('Top 10 Hashtags Retweeted')
plt.xlabel('Hashtags')
plt.ylabel('Count')
plt.show()

In [None]:
# Sum Likes per Platform and chart the (up to) ten largest totals
likes_by_platform = df.groupby('Platform')['Likes'].sum()
top_likes_platform = likes_by_platform.nlargest(10)
ax = top_likes_platform.plot.bar()
ax.set(title='Top Platforms by Total Likes', xlabel='Platform', ylabel='Total Likes')
plt.show()

In [None]:
# Top 10 countries by summed Likes.
# The bars show the sum of Likes per country, so the y-axis is labelled
# 'Total Likes' (matching the platform/user like charts) instead of 'Count'.
top_country_likes = df.groupby('Country')['Likes'].sum().nlargest(10)
top_country_likes.plot(kind='bar')
plt.title('Top Countries by Likes')
plt.xlabel('Country')
plt.ylabel('Total Likes')
plt.show()


In [None]:
# Segmentation by platform

# Facebook: keep only the rows whose Platform equals 'Facebook'
Facebook = df.loc[df['Platform'] == 'Facebook']

In [None]:
# Ten Facebook hashtags with the largest per-hashtag maximum of Retweets
H_R_f = (
    Facebook.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(10)
    .sort_values(ascending=False)
)
ax = H_R_f.plot.bar()
ax.set(title='Top 10 Hashtags Retweeted on Facebook', xlabel='Hashtags', ylabel='Count')
plt.show()

In [None]:
# Sum Likes per User on Facebook and chart the ten largest totals
likes_by_user_fb = Facebook.groupby('User')['Likes'].sum()
top_likes_platform_F = likes_by_user_fb.nlargest(10)
ax = top_likes_platform_F.plot.bar()
ax.set(title='Top Users by Total Likes on Facebook', xlabel='User', ylabel='Total Likes')
plt.show()

In [None]:
# Total likes per year on Facebook.
# Each point is that year's own sum of Likes (groupby + sum), not a running
# total, so the title and y-label say "Total ... per Year" rather than
# "Cumulative", which would misdescribe the plotted data.
f = Facebook.groupby('Year')['Likes'].sum().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=f, x='Year', y='Likes', marker='o')
# Annotate every point with its value. Zipping the two columns avoids building
# a Series per row, which iterrows() does and which does not preserve dtypes.
for year, likes in zip(f['Year'], f['Likes']):
    plt.text(year, likes, str(likes), ha='left', va='bottom')
plt.title('Total Likes per Year on Facebook')
plt.xlabel('Year')
plt.ylabel('Total Likes')
plt.show()

In [None]:
# Total retweets per year on Facebook.
# Each point is that year's own sum of Retweets (groupby + sum), not a running
# total, so the title and y-label say "Total ... per Year" rather than
# "Cumulative", which would misdescribe the plotted data.
f = Facebook.groupby('Year')['Retweets'].sum().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=f, x='Year', y='Retweets', marker='o')
# Annotate every point with its value. Zipping the two columns avoids building
# a Series per row, which iterrows() does and which does not preserve dtypes.
for year, retweets in zip(f['Year'], f['Retweets']):
    plt.text(year, retweets, str(retweets), ha='left', va='bottom')
plt.title('Total Retweets per Year on Facebook')
plt.xlabel('Year')
plt.ylabel('Total Retweets')
plt.show()

In [None]:
# Twitter: keep only the rows whose Platform equals 'Twitter'
Twitter = df.loc[df['Platform'] == 'Twitter']


In [None]:
# Ten Twitter hashtags with the largest per-hashtag maximum of Retweets
H_R_t = (
    Twitter.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(10)
    .sort_values(ascending=False)
)
ax = H_R_t.plot.bar()
ax.set(title='Top 10 Hashtags Retweeted on Twitter', xlabel='Hashtags', ylabel='Count')
plt.show()


In [None]:
# Sum Likes per User on Twitter and chart the ten largest totals
likes_by_user_tw = Twitter.groupby('User')['Likes'].sum()
top_likes_platform_t = likes_by_user_tw.nlargest(10)
ax = top_likes_platform_t.plot.bar()
ax.set(title='Top Users by Total Likes on Twitter', xlabel='User', ylabel='Total Likes')
plt.show()

In [None]:
# Total likes per year on Twitter.
# Each point is that year's own sum of Likes (groupby + sum), not a running
# total, so the title and y-label say "Total ... per Year" rather than
# "Cumulative", which would misdescribe the plotted data.
f = Twitter.groupby('Year')['Likes'].sum().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=f, x='Year', y='Likes', marker='o')
# Annotate every point with its value. Zipping the two columns avoids building
# a Series per row, which iterrows() does and which does not preserve dtypes.
for year, likes in zip(f['Year'], f['Likes']):
    plt.text(year, likes, str(likes), ha='left', va='bottom')
plt.title('Total Likes per Year on Twitter')
plt.xlabel('Year')
plt.ylabel('Total Likes')
plt.show()

In [None]:
# Total retweets per year on Twitter.
# Each point is that year's own sum of Retweets (groupby + sum), not a running
# total, so the title and y-label say "Total ... per Year" rather than
# "Cumulative", which would misdescribe the plotted data.
f = Twitter.groupby('Year')['Retweets'].sum().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=f, x='Year', y='Retweets', marker='o')
# Annotate every point with its value. Zipping the two columns avoids building
# a Series per row, which iterrows() does and which does not preserve dtypes.
for year, retweets in zip(f['Year'], f['Retweets']):
    plt.text(year, retweets, str(retweets), ha='left', va='bottom')
plt.title('Total Retweets per Year on Twitter')
plt.xlabel('Year')
plt.ylabel('Total Retweets')
plt.show()

In [None]:
# Instagram: keep only the rows whose Platform equals 'Instagram'
Instagram = df.loc[df['Platform'] == 'Instagram']


In [None]:
# Fifteen Instagram hashtags with the largest per-hashtag maximum of Retweets
H_R_i = (
    Instagram.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(15)
    .sort_values(ascending=False)
)
ax = H_R_i.plot.bar()
ax.set(title='Top 15 Hashtags Retweeted on Instagram', xlabel='Hashtags', ylabel='Count')
plt.show()

In [None]:
# Sum Likes per User on Instagram and chart the ten largest totals
likes_by_user_ig = Instagram.groupby('User')['Likes'].sum()
top_likes_platform_i = likes_by_user_ig.nlargest(10)
ax = top_likes_platform_i.plot.bar()
ax.set(title='Top Users by Total Likes on Instagram', xlabel='User', ylabel='Total Likes')
plt.show()

In [None]:
# Total likes per year on Instagram.
# Each point is that year's own sum of Likes (groupby + sum), not a running
# total, so the title and y-label say "Total ... per Year" rather than
# "Cumulative", which would misdescribe the plotted data.
f = Instagram.groupby('Year')['Likes'].sum().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=f, x='Year', y='Likes', marker='o')
# Annotate every point with its value. Zipping the two columns avoids building
# a Series per row, which iterrows() does and which does not preserve dtypes.
for year, likes in zip(f['Year'], f['Likes']):
    plt.text(year, likes, str(likes), ha='left', va='bottom')
plt.title('Total Likes per Year on Instagram')
plt.xlabel('Year')
plt.ylabel('Total Likes')
plt.show()


In [None]:
# Total retweets per year on Instagram.
# Each point is that year's own sum of Retweets (groupby + sum), not a running
# total, so the title and y-label say "Total ... per Year" rather than
# "Cumulative", which would misdescribe the plotted data.
f = Instagram.groupby('Year')['Retweets'].sum().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=f, x='Year', y='Retweets', marker='o')
# Annotate every point with its value. Zipping the two columns avoids building
# a Series per row, which iterrows() does and which does not preserve dtypes.
for year, retweets in zip(f['Year'], f['Retweets']):
    plt.text(year, retweets, str(retweets), ha='left', va='bottom')
plt.title('Total Retweets per Year on Instagram')
plt.xlabel('Year')
plt.ylabel('Total Retweets')
plt.show()