In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, ConfusionMatrixDisplay

In [7]:
# Sample data used by the exercises below.
#
# The two *_name_list values are plain lists of {'name': ...} records; the two
# *_detail_list values are dictionaries keyed by display name. Insertion order
# is kept identical to the original literals because the later cells iterate
# over .values() and print in that order.

company_name_list = [
    {'name': label} for label in ('Company 1', 'Company 2', 'Company 3')
]

employee_name_list = [
    {'name': label} for label in ('John Doe', 'Tom Smith', 'Andrew Sebastian')
]

# Company records: each has a name, a business domain and a list of clients
# (client name + lower-case country).
company_detail_list = {
    'Company 1': dict(
        name='Company 1',
        domain='Retail',
        clients=[
            dict(name='acme.inc', country='united states'),
            dict(name='Wayne.co', country='united states'),
        ],
    ),
    'Company 2': dict(
        name='Company 2',
        domain='Construction',
        clients=[
            dict(name='Tesla', country='united states'),
            dict(name='Japan Airlines', country='japan'),
            dict(name='Indofood', country='indonesia'),
        ],
    ),
    'Company 3': dict(
        name='Company 3',
        domain='Healthcare',
        clients=[
            dict(name='Petronas', country='malaysia'),
            dict(name='VW Group', country='germany'),
            dict(name='IBM', country='united states'),
            dict(name='Mitsubishi', country='japan'),
        ],
    ),
}

# Employee records keyed by full name; 'name' holds the employee id and
# 'company' holds a key of company_detail_list.
employee_detail_list = {
    'John Doe': dict(
        name='EMP-0001',
        first_name='John',
        last_name='Doe',
        full_name='John Doe',
        company='Company 1',
    ),
    'Tom Smith': dict(
        name='EMP-0002',
        first_name='Tom',
        last_name='Smith',
        full_name='Tom Smith',
        company='Company 2',
    ),
    'Andrew Sebastian': dict(
        name='EMP-0003',
        first_name='Andrew',
        last_name='Sebastian',
        full_name='Andrew Sebastian',
        company='Company 2',
    ),
}


### 1. Companies sorted by domain (descending)

In [10]:
# Take the company records and order them by domain, descending (Z to A).
sorted_companies = list(company_detail_list.values())
sorted_companies.sort(key=lambda record: record['domain'], reverse=True)

# Keep only the name and domain of each company, in sorted order.
result = [
    dict(name=record['name'], domain=record['domain'])
    for record in sorted_companies
]

print(result)


[{'name': 'Company 1', 'domain': 'Retail'}, {'name': 'Company 3', 'domain': 'Healthcare'}, {'name': 'Company 2', 'domain': 'Construction'}]


### 2. Client count per company

In [11]:
# Print one summary line per company: its name, domain and number of clients.
for company_data in company_detail_list.values():
    company_name, domain = company_data['name'], company_data['domain']
    num_clients = len(company_data['clients'])
    print("{}: {}, relation: {} clients".format(company_name, domain, num_clients))


Company 1: Retail, relation: 2 clients
Company 2: Construction, relation: 3 clients
Company 3: Healthcare, relation: 4 clients


### 3. Employee and company lookups

In [12]:
def get_employee_details(employees=None, companies=None):
    """Return each employee's full name, company and the company's domain.

    Args:
        employees: Mapping of employee records; each record must provide the
            keys 'full_name' and 'company'. Defaults to the module-level
            ``employee_detail_list``.
        companies: Mapping from company name to a company record that
            provides a 'domain' key. Defaults to the module-level
            ``company_detail_list``.

    Returns:
        A list of ``{"full_name", "company", "domain"}`` dictionaries, one per
        employee, in the iteration order of ``employees``. ``domain`` is
        ``None`` when the employee's company is not present in ``companies``
        (previously this raised ``KeyError``).
    """
    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not pass explicit data.
    if employees is None:
        employees = employee_detail_list
    if companies is None:
        companies = company_detail_list

    return [
        {
            "full_name": employee['full_name'],
            "company": employee['company'],
            # Unknown company -> None instead of aborting the whole report.
            "domain": companies.get(employee['company'], {}).get('domain'),
        }
        for employee in employees.values()
    ]

# Build the employee/company/domain records and show them.
employee_report = get_employee_details()
print(employee_report)


[{'full_name': 'John Doe', 'company': 'Company 1', 'domain': 'Retail'}, {'full_name': 'Tom Smith', 'company': 'Company 2', 'domain': 'Construction'}, {'full_name': 'Andrew Sebastian', 'company': 'Company 2', 'domain': 'Construction'}]


In [13]:
def get_companies_with_employees(companies=None, employees=None):
    """Return every company together with the full names of its employees.

    Args:
        companies: Mapping of company records; each record must provide a
            'name' key. Defaults to the module-level ``company_detail_list``.
        employees: Mapping of employee records; each record must provide the
            keys 'full_name' and 'company'. Defaults to the module-level
            ``employee_detail_list``. The 'company' values are used as
            dictionary keys, so they must be hashable (they are strings in
            the notebook data).

    Returns:
        A list of ``{"company", "employees"}`` dictionaries, one per company,
        in the iteration order of ``companies``. ``employees`` lists full
        names in the iteration order of ``employees`` and is empty for a
        company nobody works for.
    """
    if companies is None:
        companies = company_detail_list
    if employees is None:
        employees = employee_detail_list

    # Group the employees once instead of rescanning all of them for every
    # company.
    names_by_company = {}
    for employee in employees.values():
        names_by_company.setdefault(employee['company'], []).append(employee['full_name'])

    return [
        {
            "company": company['name'],
            # Copy so each result row owns its list, as in the original.
            "employees": list(names_by_company.get(company['name'], [])),
        }
        for company in companies.values()
    ]

# Build the per-company employee lists and show them.
company_roster = get_companies_with_employees()
print(company_roster)


[{'company': 'Company 1', 'employees': ['John Doe']}, {'company': 'Company 2', 'employees': ['Tom Smith', 'Andrew Sebastian']}, {'company': 'Company 3', 'employees': []}]


### Soal pre-processing data (data pre-processing exercises)

### 1. Fill missing values with the column mean

In [21]:
# pandas is already imported as `pd` in the notebook's first cell.

# Read the dataset. The file name uses the same casing as the later cells
# ('50_Startups.csv'); the previous lower-case spelling referred to a
# different file on case-sensitive filesystems.
data = pd.read_csv('50_Startups.csv')

# Find the columns that contain at least one missing value (NaN).
kolom_dengan_nan = data.columns[data.isnull().any()]

# Compute the mean of those columns. numeric_only=True skips text columns
# (e.g. 'State'), for which a mean is undefined and which would otherwise
# make .mean() raise on recent pandas versions.
nilai_mean = data[kolom_dengan_nan].mean(numeric_only=True)

# Fill the missing values with the per-column mean (columns without a mean
# are left unchanged).
data = data.fillna(nilai_mean)

# Save the filled dataset back to the CSV file read by the following cells.
data.to_csv('50_Startups.csv', index=False)

data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,122790.158298,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


### 2. One-hot encode the `State` column

In [25]:
# pandas is already imported as `pd` in the notebook's first cell.

# Read the dataset.
data = pd.read_csv('50_Startups.csv')

# This cell overwrites the file it reads, so after the first run the 'State'
# column no longer exists on disk. Only encode when it is still present;
# otherwise re-running the cell would fail with a KeyError.
if 'State' in data.columns:
    # One-hot encode the 'State' column.
    one_hot_encoded = pd.get_dummies(data['State'], prefix='State')

    # Drop the original 'State' column and append the encoded columns.
    data = pd.concat([data.drop(columns='State'), one_hot_encoded], axis=1)

# Save the transformed dataset back to the CSV file.
data.to_csv('50_Startups.csv', index=False)
data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,False,True
1,162597.7,151377.59,443898.53,191792.06,True,False,False
2,153441.51,101145.55,407934.54,191050.39,False,True,False
3,144372.41,118671.85,383199.62,182901.99,False,False,True
4,142107.34,91391.77,366168.42,166187.94,False,True,False
5,131876.9,99814.71,362861.36,156991.12,False,False,True
6,134615.46,147198.87,127716.82,156122.51,True,False,False
7,130298.13,122790.158298,323876.68,155752.6,False,True,False
8,120542.52,148718.95,311613.29,152211.77,False,False,True
9,123334.88,108679.17,304981.62,149759.96,True,False,False


### 3. Add the `Tax` column

In [28]:
import pandas as pd

# Read the dataset.
data = pd.read_csv('50_Startups.csv')

# Tax = 5% of (Profit + Marketing Spend + Administration), computed per row.
data = data.assign(
    Tax=lambda frame: (
        frame['Profit'] + frame['Marketing Spend'] + frame['Administration']
    ) * 0.05
)

# Save the dataset with the new column back to the CSV file.
data.to_csv('50_Startups.csv', index=False)

data


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York,Tax
0,165349.2,136897.8,471784.1,192261.83,False,False,True,40047.1865
1,162597.7,151377.59,443898.53,191792.06,True,False,False,39353.409
2,153441.51,101145.55,407934.54,191050.39,False,True,False,35006.524
3,144372.41,118671.85,383199.62,182901.99,False,False,True,34238.673
4,142107.34,91391.77,366168.42,166187.94,False,True,False,31187.4065
5,131876.9,99814.71,362861.36,156991.12,False,False,True,30983.3595
6,134615.46,147198.87,127716.82,156122.51,True,False,False,21551.91
7,130298.13,122790.158298,323876.68,155752.6,False,True,False,30120.971915
8,120542.52,148718.95,311613.29,152211.77,False,False,True,30627.2005
9,123334.88,108679.17,304981.62,149759.96,True,False,False,28171.0375


### 4. Standardize selected columns

In [29]:
# pandas (`pd`) and StandardScaler are already imported in the notebook's
# first cell.

# Read the dataset.
data = pd.read_csv('50_Startups.csv')

# Columns to standardize.
columns_to_scale = ['Profit', 'Marketing Spend', 'Administration', 'Tax']

# Initialize the StandardScaler.
scaler = StandardScaler()

# (A stray bare `a` statement used to sit here; `a` is not defined anywhere in
# the visible notebook, so the cell raised NameError on a fresh kernel.)

# Fit the scaler on the selected columns and replace them with the scaled
# values.
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Save the transformed dataset back to the CSV file.
data.to_csv('50_Startups.csv', index=False)

data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York,Tax
0,165349.2,0.554318,2.299248,2.024298,False,False,True,2.460946
1,162597.7,1.123258,2.040926,2.012472,True,False,False,2.361311
2,153441.51,-0.85046,1.707769,1.993801,False,True,False,1.737043
3,144372.41,-0.161817,1.478634,1.78867,False,False,True,1.62677
4,142107.34,-1.233706,1.320863,1.367903,False,True,False,1.18857
5,131876.9,-0.902751,1.290227,1.136379,False,False,True,1.159266
6,134615.46,0.959067,-0.888067,1.114512,True,False,False,-0.195209
7,130298.13,0.0,0.929087,1.1052,False,True,False,1.035416
8,120542.52,1.018794,0.815484,1.016062,False,False,True,1.108117
9,123334.88,-0.554449,0.75405,0.9543389,True,False,False,0.755381
