In [1]:
import pandas as pd

merged_df_up = pd.DataFrame(columns=["ngram", "up_count"])
merged_df_down = pd.DataFrame(columns=["ngram", "down_count"])

file_names_up = [f"./counting_{i}_상승.csv" for i in range(7)]
file_names_down = [f"./counting_{i}_하락.csv" for i in range(7)]  

def merge_and_accumulate(file_names, merged_df, column_name):
    for file_name in file_names:
        df = pd.read_csv(file_name,skiprows=[0], names=["ngram", column_name] )
        merged_df = pd.concat([merged_df, df], ignore_index=True)
    merged_df = merged_df.groupby('ngram')[column_name].sum().reset_index().sort_values(by=column_name, ascending=False, ignore_index=True)
    return merged_df

In [2]:
merged_df_up = merge_and_accumulate(file_names_up, merged_df_up, 'up_count')
merged_df_down = merge_and_accumulate(file_names_down, merged_df_down, 'down_count')

In [4]:
merged_df_up

Unnamed: 0,ngram,up_count
0,금리,273918
1,있,273537
2,보,188307
3,자세히,168130
4,자세히;보,161589
...,...,...
44003769,법;복잡;임대;기간,1
44003770,법;복잡;임대;기간;지났,1
44003771,법;복합,1
44003772,법;복합;작용,1


In [3]:
merged_df_down

Unnamed: 0,ngram,down_count
0,있,485497
1,금리,478320
2,보,332385
3,자세히,295024
4,일,286383
...,...,...
75447820,벗어나;소비;밀;올리;매우,1
75447821,벗어나;소비;심리,1
75447822,벗어나;소비;심리;회복,1
75447823,벗어나;소비;심리;회복;미국,1


In [4]:
merged_df_up.to_csv('./ngram_count_up.csv', index=False)
merged_df_down.to_csv('./ngram_count_down.csv', index=False)

---
상승, 하락 파일 병합하고 min_count 15 적용하기

In [2]:
import pandas as pd

df1 = pd.read_csv('./ngram_count_up.csv')
df2 = pd.read_csv('./ngram_count_down.csv')

merged_df = df1.merge(df2, on='ngram', how='outer')
# merged_df['up_count'] = merged_df['up_count'].fillna(0).astype(int)
# merged_df['down_count'] = merged_df['down_count'].fillna(0).astype(int)
# merged_df['sum_count'] = merged_df['up_count'] + merged_df['down_count']

# filtered_df = merged_df[merged_df['sum_count'] >= 15]

# filtered_df = filtered_df[['ngram', 'up_count', 'down_count']]
# filtered_df.to_csv('./filtered_file.csv', index=False)


In [3]:
merged_df

Unnamed: 0,ngram,up_count,down_count
0,금리,273918.0,478320
1,있,273537.0,485497
2,보,188307.0,332385
3,자세히,168130.0,295024
4,자세히;보,161589.0,283877
...,...,...,...
75447820,벗어나;셀,,1
75447821,벗어나;셀;트리,,1
75447822,벗어나;셀;트리;온의,,1
75447823,벗어나;셀;트리;온의;이날,,1


In [4]:
merged_df['up_count'] = merged_df['up_count'].fillna(0).astype(int)
merged_df['down_count'] = merged_df['down_count'].fillna(0).astype(int)
merged_df['sum_count'] = merged_df['up_count'] + merged_df['down_count']

In [5]:
merged_df

Unnamed: 0,ngram,up_count,down_count,sum_count
0,금리,273918,478320,752238
1,있,273537,485497,759034
2,보,188307,332385,520692
3,자세히,168130,295024,463154
4,자세히;보,161589,283877,445466
...,...,...,...,...
75447820,벗어나;셀,0,1,1
75447821,벗어나;셀;트리,0,1,1
75447822,벗어나;셀;트리;온의,0,1,1
75447823,벗어나;셀;트리;온의;이날,0,1,1


In [6]:
merged_df.to_csv('./test.csv', index=False)

In [2]:
merged_df

Unnamed: 0,ngram,up_count,down_count
0,금리,273918.0,478320
1,있,273537.0,485497
2,보,188307.0,332385
3,자세히,168130.0,295024
4,자세히;보,161589.0,283877
...,...,...,...
75447820,벗어나;셀,,1
75447821,벗어나;셀;트리,,1
75447822,벗어나;셀;트리;온의,,1
75447823,벗어나;셀;트리;온의;이날,,1


In [3]:
merged_df['up_count'].fillna(0, inplace=True)
merged_df['down_count'].fillna(0, inplace=True)

In [4]:
merged_df

Unnamed: 0,ngram,up_count,down_count
0,금리,273918.0,478320
1,있,273537.0,485497
2,보,188307.0,332385
3,자세히,168130.0,295024
4,자세히;보,161589.0,283877
...,...,...,...
75447820,벗어나;셀,0.0,1
75447821,벗어나;셀;트리,0.0,1
75447822,벗어나;셀;트리;온의,0.0,1
75447823,벗어나;셀;트리;온의;이날,0.0,1


In [6]:
merged_df['sum_count'] = merged_df['up_count'] + merged_df['down_count']

In [7]:
merged_df

Unnamed: 0,ngram,up_count,down_count,sum_count
0,금리,273918.0,478320,752238.0
1,있,273537.0,485497,759034.0
2,보,188307.0,332385,520692.0
3,자세히,168130.0,295024,463154.0
4,자세히;보,161589.0,283877,445466.0
...,...,...,...,...
75447820,벗어나;셀,0.0,1,1.0
75447821,벗어나;셀;트리,0.0,1,1.0
75447822,벗어나;셀;트리;온의,0.0,1,1.0
75447823,벗어나;셀;트리;온의;이날,0.0,1,1.0


In [8]:
filtered_df = merged_df[merged_df['sum_count'] >= 15]

In [9]:
filtered_df

Unnamed: 0,ngram,up_count,down_count,sum_count
0,금리,273918.0,478320,752238.0
1,있,273537.0,485497,759034.0
2,보,188307.0,332385,520692.0
3,자세히,168130.0,295024,463154.0
4,자세히;보,161589.0,283877,445466.0
...,...,...,...,...
44027085,훌쩍;다음;추천;언제;욕실,0.0,15,15.0
44027086,공공기관;기능;전면;재검토;무단,0.0,15,15.0
44027087,임원;전체;사직서;제출;무단,0.0,15,15.0
44027088,우호적;태도,0.0,15,15.0


In [13]:
filtered_df

Unnamed: 0,ngram,up_count,down_count,sum_count
0,금리,273918,478320,752238.0
1,있,273537,485497,759034.0
2,보,188307,332385,520692.0
3,자세히,168130,295024,463154.0
4,자세히;보,161589,283877,445466.0
...,...,...,...,...
44027085,훌쩍;다음;추천;언제;욕실,0,15,15.0
44027086,공공기관;기능;전면;재검토;무단,0,15,15.0
44027087,임원;전체;사직서;제출;무단,0,15,15.0
44027088,우호적;태도,0,15,15.0


In [14]:
filtered_df = filtered_df[['ngram', 'up_count', 'down_count']]


In [18]:
filtered_df.reset_index(inplace=True)

In [21]:
filtered_df

Unnamed: 0,index,ngram,up_count,down_count
0,0,금리,273918,478320
1,1,있,273537,485497
2,2,보,188307,332385
3,3,자세히,168130,295024
4,4,자세히;보,161589,283877
...,...,...,...,...
1009823,44027085,훌쩍;다음;추천;언제;욕실,0,15
1009824,44027086,공공기관;기능;전면;재검토;무단,0,15
1009825,44027087,임원;전체;사직서;제출;무단,0,15
1009826,44027088,우호적;태도,0,15


In [22]:
filtered_df = filtered_df[['ngram', 'up_count', 'down_count']]

In [23]:
filtered_df

Unnamed: 0,ngram,up_count,down_count
0,금리,273918,478320
1,있,273537,485497
2,보,188307,332385
3,자세히,168130,295024
4,자세히;보,161589,283877
...,...,...,...
1009823,훌쩍;다음;추천;언제;욕실,0,15
1009824,공공기관;기능;전면;재검토;무단,0,15
1009825,임원;전체;사직서;제출;무단,0,15
1009826,우호적;태도,0,15


In [24]:
filtered_df.to_csv('./ngram_counts_merged_and_filtered.csv', index=False)