-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
150 lines (116 loc) · 4.66 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from gnews import GNews
from datetime import datetime
import pandas as pd
from googletranslate import translate
import requests
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib
from classify import predict_using_maibert
def translate_text(x):
    """Translate *x* from English to Maithili ("mai") via googletranslate.

    Returns the translation result, or None (instead of raising) when the
    HTTP request fails or the service returns a non-JSON payload, so that
    callers can later drop failed rows with ``dropna``.
    """
    try:
        return translate(x, dest="mai", src="en")
    except requests.exceptions.RequestException as e:
        # Network-level failure: report it and degrade gracefully.
        print(f"Error making translation request: {e}")
        return None
    except json.decoder.JSONDecodeError as e:
        # Service replied, but the body was not valid JSON.
        print(f"Error decoding JSON response: {e}")
        return None
# Pull Google News results for the "nepal" query.
google_news = GNews()
nepal_news = google_news.get_news("nepal")

# Each entry's "published date" arrives as an RFC-2822-style string;
# convert it to a datetime so the list can be ordered chronologically.
_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
for item in nepal_news:
    item["published date"] = datetime.strptime(item["published date"], _DATE_FORMAT)

# Oldest first.
nepal_news.sort(key=lambda entry: entry["published date"])
print("--------------Sorted news by date--------------")

# Flatten the list of dicts into a tabular DataFrame.
df = pd.json_normalize(nepal_news)
print("-----------Normalized news------------")
# Number of most-recent news items to fully process.
# (The original comment said "last 30" but the code has always used 20.)
max_length_of_news = 20

# Fetch each article exactly ONCE and derive both columns from it.
# The original code called google_news.get_full_article twice per URL
# (once for the text, once for the images), doubling every download.
_articles = df.tail(max_length_of_news)["url"].apply(google_news.get_full_article)
df["full_article"] = _articles.apply(lambda a: a.text if a is not None else "")
df["images"] = _articles.apply(lambda a: a.images if a is not None else "")
print("--------Got full article and images----------")

# Strip characters outside the Basic Multilingual Plane (ord >= 0x10000),
# e.g. emoji. NOTE(review): despite the original "non-ascii" comment, this
# keeps all BMP characters — a true ASCII filter would be ord < 128.
df["full_article"] = df.tail(max_length_of_news)["full_article"].apply(
    lambda x: "".join(ch for ch in str(x) if ord(ch) < 0x10000)
)
print("--------Removed non-ascii characters----------")
# Translate the cleaned article text to Maithili. translate_text returns
# None on request/JSON errors, so failed rows can be dropped downstream.
df["translated"] = df.tail(max_length_of_news)["full_article"].apply(translate_text)
print("--------Translated to maithili---------")

# Persist today's batch before any further processing.
df.tail(max_length_of_news).to_csv("today_news.csv", index=False)
print("--------Saved to csv--------")
# Drop rows whose translation failed (translate_text returned None).
df = df.dropna(subset=["translated"])
print("---------Dropped null values--------")

# Label each article with the MaiBERT deep-learning classifier from
# classify.py. (This replaced an earlier TF-IDF + SVM pipeline, whose
# commented-out remnants have been removed.)
print("---------Applying deeplearning model--------")
df = predict_using_maibert(df)
print("---------Predicted the label--------")
# Load the running archive of all previously collected news.
# Start from an empty frame when the archive does not exist yet
# (e.g. the very first run) instead of crashing.
try:
    main_df = pd.read_csv("filename.csv", on_bad_lines="skip")
except FileNotFoundError:
    main_df = pd.DataFrame()

# Append today's batch and de-duplicate by title.
merged_df = (
    pd.concat([main_df, df.tail(30)])
    .drop_duplicates(subset=["title"])
    .reset_index(drop=True)
)
print("---------Merged two dataframe--------")

# Drop rows with no translation before persisting.
print("----------Removing Nan Column -----------")
merged_df = merged_df.dropna(subset=["translated"])

# Persist the full archive as CSV and JSON.
merged_df.to_csv("filename.csv", index=False)
print("Done")
merged_df.to_json("docs/filename.json", orient="records")
print("json file created successfully")

# NOTE(review): the file is named "last_100" but holds the last 200 rows;
# the path is kept unchanged for compatibility with whatever consumes it.
merged_df.tail(200).to_json("docs/last_100_news.json", orient="records")
print("json file created successfully")
merged_df.tail(200).to_csv("last_200_news.csv", index=False)
print("Done")