In [41]:
import pandas as pd
import os

In [42]:
def process_file(file_path):
    # Load the data từ file .txt
    df = pd.read_csv(file_path, sep='\t', engine='python')
    
    if df.shape[0] == 0:
        print(f"{file_path} is empty or has no valid data!")
        return

    # Remove rows where 'Price High' is 0
    df = df[df['Volume'] != 0]

    # Remove duplicates based on 'Date'
    df = df[df['Volume'].notna()]
    df = df.drop_duplicates(subset=['Date'], keep='first')

    # Tạo đường dẫn file output
    output_file = file_path.rsplit('.', 1)[0] + '.csv'
    output_file = output_file.replace('Korea', 'Korea_csv')

    # Tạo thư mục nếu chưa tồn tại
    output_dir = os.path.dirname(output_file)
    os.makedirs(output_dir, exist_ok=True)  # Tạo thư mục nếu chưa có

    # Lưu file CSV
    df.to_csv(output_file, index=False)
    print(f"Processed file saved to {output_file}")

In [43]:
import zipfile
import os

# Đường dẫn đến file zip
zip_path = "Korea.zip"
extract_folder = "Korea"  # Thư mục muốn giải nén vào

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# Xác nhận các file đã giải nén
print(os.listdir(extract_folder))


['Korea']


In [44]:
txt_folder = "Korea/Korea"

# Lọc tất cả file trừ 'danh_sach.txt'
files = [file for file in os.listdir(txt_folder) if file.endswith('.txt')]
files.sort()

# Xử lý từng file
for file in files:
    file_path = f'{txt_folder}/{file}'
    process_file(file_path)

Processed file saved to Korea_csv/Korea_csv/000040.csv
Processed file saved to Korea_csv/Korea_csv/000050.csv
Processed file saved to Korea_csv/Korea_csv/000070.csv
Processed file saved to Korea_csv/Korea_csv/000080.csv
Processed file saved to Korea_csv/Korea_csv/000100.csv
Processed file saved to Korea_csv/Korea_csv/000140.csv
Processed file saved to Korea_csv/Korea_csv/000180.csv
Processed file saved to Korea_csv/Korea_csv/000220.csv
Processed file saved to Korea_csv/Korea_csv/000225.csv
Processed file saved to Korea_csv/Korea_csv/000227.csv
Processed file saved to Korea_csv/Korea_csv/000230.csv
Processed file saved to Korea_csv/Korea_csv/000240.csv
Processed file saved to Korea_csv/Korea_csv/000270.csv
Processed file saved to Korea_csv/Korea_csv/000320.csv
Processed file saved to Korea_csv/Korea_csv/000370.csv
Processed file saved to Korea_csv/Korea_csv/000390.csv
Processed file saved to Korea_csv/Korea_csv/000400.csv
Processed file saved to Korea_csv/Korea_csv/000400_r.csv
Processe

KeyError: 'Volume'

In [45]:
import pandas as pd
import os

# Đọc toàn bộ dữ liệu của các công ty từ file CSV
csv_folder = "Korea_csv/Korea_csv"
csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]

# Tạo DataFrame lớn để chứa dữ liệu của tất cả công ty
all_data = []

for file in csv_files:
    file_path = f"{csv_folder}/{file}"
    company_name = file.replace('.csv', '')  # Lấy tên công ty từ tên file
    df = pd.read_csv(file_path)
    df['Company'] = company_name  # Thêm cột để phân biệt công ty
    df['Date'] = pd.to_datetime(df['Date'])  # Chuyển cột Date thành kiểu datetime
    df = df.sort_values(by='Date')  # Sắp xếp dữ liệu theo ngày
    all_data.append(df)

# Gộp dữ liệu các công ty thành một DataFrame lớn
combined_df = pd.concat(all_data, ignore_index=True)
print(f"Đọc thành công dữ liệu của {len(csv_files)} công ty với tổng số {combined_df.shape[0]} dòng.")


Đọc thành công dữ liệu của 863 công ty với tổng số 3113885 dòng.


In [46]:
combined_df

Unnamed: 0,Date,Price Open,Price Low,Price High,Price Close,Volume,Company
0,2000-01-04,60993.477914,60993.477914,69796.660293,69796.660293,1047.0,000040
1,2000-01-05,75455.848966,75455.848966,79857.440155,79857.440155,365.0,000040
2,2000-01-06,91804.616242,84259.031345,91804.616242,85516.628828,1799.0,000040
3,2000-01-07,81115.037638,76084.647707,81115.037638,76713.446449,634.0,000040
4,2000-01-10,75455.848966,75455.848966,83001.433862,83001.433862,445.0,000040
...,...,...,...,...,...,...,...
3113880,2022-12-23,7920.000000,7630.000000,7980.000000,7660.000000,62426.0,950210
3113881,2022-12-26,7710.000000,7290.000000,7710.000000,7380.000000,56055.0,950210
3113882,2022-12-27,7380.000000,7280.000000,7430.000000,7310.000000,33633.0,950210
3113883,2022-12-28,7320.000000,7190.000000,7380.000000,7320.000000,18510.0,950210


In [47]:
# Xuất toàn bộ DataFrame thành file CSV
combined_df.to_csv('combined_analysis.csv', index=False)
print("File CSV đã được lưu thành công!")

File CSV đã được lưu thành công!


In [49]:
from IPython.display import FileLink
# Tạo link tải file
FileLink('combined_analysis.csv')


In [58]:
import pandas as pd

# Đọc file Korea.xlsx
korea_df = pd.read_excel('Korea/Korea/Korea.xlsx')
combined_df = pd.read_csv('combined_analysis.csv', dtype={'Company': str})

# Chuẩn hóa và cắt cụm trước dấu chấm
korea_df['Company'] = korea_df['RIC'].str.split('.').str[0]

# Gộp file dựa trên khóa chung
merged_df = pd.merge(korea_df, combined_df, on='Company', how='left')

merged_df['Companyname'] = merged_df['Name'].factorize()[0]

merged_df = merged_df.sort_values(by=['Companyname', 'Date'])
merged_df = merged_df.drop(columns=['Companyname','Company'])

# Lưu file CSV mới
merged_df.to_csv('merged_analysis.csv', index=False)
print("File merged_analysis.csv đã được lưu thành công!")


File merged_analysis.csv đã được lưu thành công!


In [60]:
from IPython.display import FileLink
# Tạo link tải file
FileLink('merged_analysis.csv')


In [61]:
merged_df

Unnamed: 0,Name,Symbol,RIC,Start Date,Hist.,Category,Exchange,Market,Currency,Sector,Full Name,Activity,Date,Price Open,Price Low,Price High,Price Close,Volume
0,SAMSUNG ELECTRONICS,KO:SGL,005930.KS,7/2/1984,1984,Equities,Korea,South Korea,South Korean Won,Technology Hardware and Equipment,Samsung Electronics,Active,2000-01-04,6000.0,5660.0,6110.0,6110.0,74198349.0
1,SAMSUNG ELECTRONICS,KO:SGL,005930.KS,7/2/1984,1984,Equities,Korea,South Korea,South Korean Won,Technology Hardware and Equipment,Samsung Electronics,Active,2000-01-05,5800.0,5520.0,6060.0,5580.0,74680199.0
2,SAMSUNG ELECTRONICS,KO:SGL,005930.KS,7/2/1984,1984,Equities,Korea,South Korea,South Korean Won,Technology Hardware and Equipment,Samsung Electronics,Active,2000-01-06,5750.0,5580.0,5780.0,5620.0,54390499.0
3,SAMSUNG ELECTRONICS,KO:SGL,005930.KS,7/2/1984,1984,Equities,Korea,South Korea,South Korean Won,Technology Hardware and Equipment,Samsung Electronics,Active,2000-01-07,5560.0,5360.0,5670.0,5540.0,40309749.0
4,SAMSUNG ELECTRONICS,KO:SGL,005930.KS,7/2/1984,1984,Equities,Korea,South Korea,South Korean Won,Technology Hardware and Equipment,Samsung Electronics,Active,2000-01-10,5600.0,5580.0,5770.0,5770.0,46880749.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3114117,YUYU PHARMA 2PB,KO:YIN,000227.KS,6/27/2000,2000,Equities,Korea,South Korea,South Korean Won,Pharmaceuticals and Biotechnology,Yuyu Pharma 2PB,Active,2022-12-23,17550.0,15050.0,15350.0,15300.0,317.0
3114118,YUYU PHARMA 2PB,KO:YIN,000227.KS,6/27/2000,2000,Equities,Korea,South Korea,South Korean Won,Pharmaceuticals and Biotechnology,Yuyu Pharma 2PB,Active,2022-12-26,18200.0,15100.0,15500.0,15500.0,110.0
3114119,YUYU PHARMA 2PB,KO:YIN,000227.KS,6/27/2000,2000,Equities,Korea,South Korea,South Korean Won,Pharmaceuticals and Biotechnology,Yuyu Pharma 2PB,Active,2022-12-27,17650.0,15100.0,15500.0,15300.0,484.0
3114120,YUYU PHARMA 2PB,KO:YIN,000227.KS,6/27/2000,2000,Equities,Korea,South Korea,South Korean Won,Pharmaceuticals and Biotechnology,Yuyu Pharma 2PB,Active,2022-12-28,17550.0,15100.0,15200.0,15100.0,49.0
