### Generate a Synthetic Dataset

In [30]:
# import necessary libraries
import pandas as pd
import numpy as np
from faker import Faker
import random

# initialize faker
fake = Faker()

# set the seeds for reproducibility.
# Three independent generators feed this dataset: numpy (age, salary), the
# stdlib `random` module (city) and Faker (name, join_date). Faker has to be
# seeded separately, otherwise the names and dates change on every run while
# the other columns stay fixed.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# generate the data: 100 rows, one list/array per column
data = {
    'id': range(1, 101),
    'name': [fake.name() for _ in range(100)],
    'age': np.random.randint(18, 60, size=100),       # upper bound is exclusive: 18..59
    'city': [random.choice(['New York', 
                            'Los Angeles', 
                            'Chicago', 
                            'Houston', 
                            'Phoenix']) for _ in range(100)],
    'salary': np.random.randint(30000, 150000, size=100),
    # the date window is relative to the day the cell is run ('-10y' .. 'today')
    'join_date': [fake.date_between(start_date='-10y', 
                                    end_date='today') for _ in range(100)]
}

# create dataframe from the generated data
df = pd.DataFrame(data)

# save dataframe to csv file (index=False keeps the row index out of the file)
df.to_csv('synthetic_data.csv', index=False)

# reading the csv back, so the rest of the notebook works on the file contents
df = pd.read_csv('synthetic_data.csv')

# display the first few rows of the dataframe
print(df.head())
print("\n" + "-"*40 + "\n")

   id               name  age         city  salary   join_date
0   1  Melody Cunningham   56     New York   38392  2022-01-20
1   2       Laura French   46     New York   60535  2022-11-18
2   3        Brandi Reed   32      Chicago  108603  2015-03-24
3   4    Douglas Kennedy   25  Los Angeles  143569  2016-11-16
4   5        Tammy Evans   38  Los Angeles   82256  2015-04-15

----------------------------------------



In [19]:
# viewing the data: last rows of the dataframe
print(df.tail())
print("\n" + "-"*40 + "\n")

# summary statistics
print(df.describe())
print("\n" + "-"*40 + "\n")

# information about dataframe
# df.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would add a stray "None" line after the report.
df.info()

     id                   name  age         city  salary   join_date
95   96            Eric Burton   59      Houston   96199  2020-09-02
96   97          James Wheeler   56      Houston   64766  2017-06-18
97   98           Teresa Smith   58  Los Angeles  103530  2016-07-15
98   99  Miss Margaret Miranda   45      Chicago  123557  2023-01-20
99  100        Nancy Hendricks   24  Los Angeles   91087  2023-08-15

----------------------------------------

               id         age         salary
count  100.000000  100.000000     100.000000
mean    50.500000   37.910000   94100.250000
std     29.011492   12.219454   35805.816201
min      1.000000   18.000000   30206.000000
25%     25.750000   26.750000   60421.250000
50%     50.500000   38.000000   96520.500000
75%     75.250000   46.250000  124892.750000
max    100.000000   59.000000  149324.000000

----------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 co

In [33]:
# selecting a single column
print(df['name'])
print("\n" + "-"*40 + "\n")

# selecting multiple columns (pass a list of column labels)
print(df[['name', 'city']])
print("\n" + "-"*40 + "\n")

# keep the two-column selection under its own name; the label is printed
# directly above the table it describes and the separator closes the section
selected_columns = df[['name', 'city']]
print("Selected 'name' and 'city' columns:")
print(selected_columns)
print("\n" + "-"*40 + "\n")

# selecting rows by index
print(df.iloc[0]) # first row
print("\n" + "-"*40 + "\n")

# selecting rows by condition (boolean mask: keep rows where age > 30)
print(df[df['age'] > 30])
print("\n" + "-"*40 + "\n")

0     Melody Cunningham
1          Laura French
2           Brandi Reed
3       Douglas Kennedy
4           Tammy Evans
            ...        
95          Amy Daniels
96          Karen Reyes
97       Patrick Maddox
98    Christopher Perez
99        Jimmy Fleming
Name: name, Length: 100, dtype: object

----------------------------------------

                 name         city
0   Melody Cunningham     New York
1        Laura French     New York
2         Brandi Reed      Chicago
3     Douglas Kennedy  Los Angeles
4         Tammy Evans  Los Angeles
..                ...          ...
95        Amy Daniels      Houston
96        Karen Reyes      Houston
97     Patrick Maddox  Los Angeles
98  Christopher Perez      Chicago
99      Jimmy Fleming  Los Angeles

[100 rows x 2 columns]

----------------------------------------

Selected 'name' and 'city' columns:

----------------------------------------

                 name         city
0   Melody Cunningham     New York
1        Laura Fre

In [29]:
# handling missing values
# Each strategy gets its own plain variable. Assigning to `df.clean` would set
# an attribute on the DataFrame object (pandas warns about this) instead of
# creating a new variable.

# option 1: drop rows with missing values
df_dropped = df.dropna()

# option 2: fill missing values with 0
df_filled = df.fillna(0)

# continue with the filled version
df = df_filled

print(df.head())

   id               name  age         city  salary   join_date
0   1  Christine Carlson   56     New York   38392  2022-08-18
1   2        James Adams   46     New York   60535  2017-07-17
2   3       Linda Nelson   32      Chicago  108603  2018-09-01
3   4     David Mcintosh   25  Los Angeles  143569  2019-11-07
4   5  Nicholas Thompson   38  Los Angeles   82256  2018-01-18
