In [167]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [168]:
df = pd.read_csv("advertising.csv")

In [169]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 01:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 02:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 03:36,0


In [170]:
# Check if there are any duplicates
df.duplicated().sum()

0

In [171]:
# Check if there is any missing data
df.isnull().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.2+ KB


In [173]:
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025
min,32.6,19.0,13996.5,104.78,0.0,0.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0
max,91.43,61.0,79484.8,269.96,1.0,1.0


In [174]:
# Calculate statistics
statistics = df.describe()

# Save the statistics to a new CSV file
statistics.to_csv('Statistics.csv')

print("Statistics saved to Statistics.csv")

Statistics saved to Statistics.csv


In [175]:
# Convert "Timestamp" to datetime feature instead of "object"
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df["Timestamp"][:5]

0   2016-03-27 00:53:00
1   2016-04-04 01:39:00
2   2016-03-13 20:35:00
3   2016-10-01 02:31:00
4   2016-03-06 03:36:00
Name: Timestamp, dtype: datetime64[ns]

# Exploratory Data Analysis¶


In [176]:
# Number of examples for each target feature value "Clicked on Ad"
df["Clicked on Ad"].value_counts()

0    500
1    500
Name: Clicked on Ad, dtype: int64

In [177]:
# Separate data based on 'Clicked on Ad' column
clicked_on_ad_0 = df[df['Clicked on Ad'] == 0]
clicked_on_ad_1 = df[df['Clicked on Ad'] == 1]

# Save separated data into CSV files
clicked_on_ad_0.to_csv('clicked_on_ad_0.csv', index=False)
clicked_on_ad_1.to_csv('clicked_on_ad_1.csv', index=False)

# Version 2 

In [178]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [179]:
# Load data
clicked_on_ad_0 = pd.read_csv('clicked_on_ad_0.csv')
clicked_on_ad_1 = pd.read_csv('clicked_on_ad_1.csv')

In [180]:
clicked_on_ad_0.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:00,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:00,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:00,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-10-01 02:31:00,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-03-06 03:36:00,0


In [181]:
clicked_on_ad_1.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,66.0,48,24593.33,131.76,Reactive local challenge,Port Jefferybury,1,Australia,2016-07-03 01:40:00,1
1,47.64,49,45632.51,122.02,Centralized neutral neural-net,West Brandonton,0,Qatar,2016-03-16 20:19:00,1
2,69.57,48,51636.92,113.12,Centralized content-based focus group,West Katiefurt,1,Egypt,2016-03-06 01:14:00,1
3,42.95,33,30976.0,143.56,Grass-roots coherent extranet,West William,0,Barbados,2016-03-24 09:31:00,1
4,63.45,23,52182.23,140.64,Persistent demand-driven interface,New Travistown,1,Spain,2016-09-03 03:41:00,1


In [182]:
clicked_on_ad_0 = clicked_on_ad_0.drop(["Ad Topic Line","City","Male","Country","Timestamp"], axis=1)
clicked_on_ad_1 = clicked_on_ad_1.drop(["Ad Topic Line","City","Male","Country","Timestamp"], axis=1)

In [183]:
clicked_on_ad_1.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
0,66.0,48,24593.33,131.76,1
1,47.64,49,45632.51,122.02,1
2,69.57,48,51636.92,113.12,1
3,42.95,33,30976.0,143.56,1
4,63.45,23,52182.23,140.64,1


In [184]:
combined_df = pd.concat([clicked_on_ad_0, clicked_on_ad_1], ignore_index=True)

In [185]:
combined_df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
0,68.95,35,61833.9,256.09,0
1,80.23,31,68441.85,193.77,0
2,69.47,26,59785.94,236.5,0
3,74.15,29,54806.18,245.89,0
4,68.37,35,73889.99,225.58,0


In [186]:
combined_df.tail()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
995,43.7,28,63126.96,173.01,1
996,72.97,30,71384.57,208.58,1
997,51.3,45,67782.17,134.42,1
998,51.63,51,42415.72,120.37,1
999,45.01,26,29875.8,178.35,1


In [187]:
combined_df.describe

<bound method NDFrame.describe of      Daily Time Spent on Site  Age  Area Income  Daily Internet Usage  \
0                       68.95   35     61833.90                256.09   
1                       80.23   31     68441.85                193.77   
2                       69.47   26     59785.94                236.50   
3                       74.15   29     54806.18                245.89   
4                       68.37   35     73889.99                225.58   
..                        ...  ...          ...                   ...   
995                     43.70   28     63126.96                173.01   
996                     72.97   30     71384.57                208.58   
997                     51.30   45     67782.17                134.42   
998                     51.63   51     42415.72                120.37   
999                     45.01   26     29875.80                178.35   

     Clicked on Ad  
0                0  
1                0  
2                0  
3    

# Version 3


In [188]:
import pandas as pd

# Load the data
df_New = pd.read_csv('advertising.csv')
df_New = df_New.drop(["Ad Topic Line","City","Male","Country","Timestamp"], axis=1)

In [189]:
df_New.tail()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
995,72.97,30,71384.57,208.58,1
996,51.3,45,67782.17,134.42,1
997,51.63,51,42415.72,120.37,1
998,55.55,19,41920.79,187.95,0
999,45.01,26,29875.8,178.35,1


In [190]:
# Define the number of partitions
num_partitions = 10

# Calculate the number of rows in each partition
rows_per_partition = len(df_New) // num_partitions

# Create empty list to store partitions
partitions = []

# Divide the data into partitions
for i in range(num_partitions):
    partition = data.iloc[i * rows_per_partition: (i + 1) * rows_per_partition]
    partitions.append(partition)

# Describe each partition
for i, partition in enumerate(partitions, start=1):
    print(f"Partition {i} Description:")
    print(partition.describe())
    print("\n")

Partition 1 Description:
       Daily Time Spent on Site         Age   Area Income  \
count                100.000000  100.000000    100.000000   
mean                  64.022100   35.750000  54483.851200   
std                   15.346089    8.860735  14182.101777   
min                   33.210000   20.000000  22473.080000   
25%                   51.570000   29.000000  47823.552500   
50%                   65.910000   35.000000  57605.795000   
75%                   77.540000   41.000000  64361.852500   
max                   88.910000   57.000000  76435.300000   

       Daily Internet Usage        Male  Clicked on Ad  
count             100.00000  100.000000     100.000000  
mean              174.69810    0.480000       0.520000  
std                43.50661    0.502117       0.502117  
min               105.15000    0.000000       0.000000  
25%               133.65750    0.000000       0.000000  
50%               175.19000    0.000000       1.000000  
75%               210.3600

# Version 4

In [191]:
# Assuming you want to sample 100 random rows for calculating statistics
num_samples = 100

In [None]:
# Infinite loop to continuously sample and calculate statistics
while True:
    # Wait for user input
    input("Press Enter to calculate statistics or 'q' to quit: ")
    
    # Randomly sample rows from the dataset
    random_data = df_New.sample(n=num_samples, random_state=42)
    
    # Calculate statistics
    statistics = random_data.describe()
    
    # Print the statistics
    print(statistics)


Press Enter to calculate statistics or 'q' to quit: 
       Daily Time Spent on Site         Age   Area Income  \
count                100.000000  100.000000    100.000000   
mean                  62.360100   35.150000  53702.952700   
std                   15.667204    8.485728  13904.313807   
min                   32.600000   19.000000  14548.060000   
25%                   49.422500   28.750000  44024.102500   
50%                   62.785000   34.500000  54831.660000   
75%                   75.982500   40.250000  65492.240000   
max                   89.340000   52.000000  77143.610000   

       Daily Internet Usage  Clicked on Ad  
count            100.000000     100.000000  
mean             171.341500       0.560000  
std               43.260161       0.498888  
min              105.000000       0.000000  
25%              133.582500       0.000000  
50%              167.445000       1.000000  
75%              203.450000       1.000000  
max              267.010000       1.0

# Random Forest

In [81]:
# Helper function to calculate statistics
def calculate_statistics(combined_df):
    statistics = {
        'Average': combined_df.mean(),
        'Mean': combined_df.mean(),
        'Median': combined_df.median(),
        'Mode': combined_df.mode().iloc[0],
        'Max': combined_df.max(),
        'Min': combined_df.min()
    }
    return statistics

In [82]:
# Prepare data for modeling
def prepare_data(combined_df):
    # Assuming the target variable is 'Clicked on Ad'
    X = combined_df.drop(columns=['Clicked on Ad'])
    y = combined_df['Clicked on Ad']
    return X, y

In [83]:
# Function to train Random Forest model and calculate metrics
def train_random_forest(X_train, y_train, X_test, y_test):
    # Initialize Random Forest classifier
    model = RandomForestClassifier()
    # Train the model
    model.fit(X_train, y_train)
    # Predict on test set
    y_pred = model.predict(X_test)
    # Calculate accuracy for classification
    accuracy = accuracy_score(y_test, y_pred)
    # Return accuracy
    return accuracy

In [84]:
# Calculate statistics for both datasets
statistics_clicked_on_ad_0 = calculate_statistics(clicked_on_ad_0)
statistics_clicked_on_ad_1 = calculate_statistics(clicked_on_ad_1)

In [88]:
# Prepare data for modeling from the combined DataFrame
X_combined, y_combined = prepare_data(combined_df)

# Split the combined data into clicked_on_ad_0 and clicked_on_ad_1
X_0, y_0 = X_combined[combined_df['Clicked on Ad'] == 0], y_combined[combined_df['Clicked on Ad'] == 0]
X_1, y_1 = X_combined[combined_df['Clicked on Ad'] == 1], y_combined[combined_df['Clicked on Ad'] == 1]

In [89]:
# Split data into train and test sets
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_0, y_0, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)


In [90]:
# Train Random Forest model and calculate accuracy
accuracy_0 = train_random_forest(X_train_0, y_train_0, X_test_0, y_test_0)
accuracy_1 = train_random_forest(X_train_1, y_train_1, X_test_1, y_test_1)

In [91]:
print("Accuracy for clicked_on_ad_0.csv:", accuracy_0)
print("Accuracy for clicked_on_ad_1.csv:", accuracy_1)

Accuracy for clicked_on_ad_0.csv: 1.0
Accuracy for clicked_on_ad_1.csv: 1.0
