In [None]:
import torch

# Report whether PyTorch can see a CUDA device before any GPU work is attempted.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True if CUDA is properly detected
print(torch.cuda.device_count())  # Number of available CUDA devices
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Name of the first CUDA device
else:
    # Asking for the name of device 0 raises when no CUDA device is usable,
    # so report the situation instead of letting this diagnostic cell crash.
    print("No CUDA device detected")

In [None]:
import pandas as pd
import string
import langid
from pandarallel import pandarallel

# Set up pandarallel with its progress bar enabled; later cells call
# `.parallel_apply` on Series.
pandarallel.initialize(progress_bar=True)

In [None]:
import os

# Show the current working directory; the data paths used in this notebook are relative.
print(os.getcwd())

In [None]:
# Load the raw dataset from a path relative to the working directory.
df = pd.read_csv('data/raw/cleantech_media_dataset_v2_2024-02-23.csv')

In [None]:
# Display the freshly loaded frame.
df

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Discard the 'author' column and give the 'Unnamed: 0' column the name 'doc_id'.
df = (
    df
    .drop(columns=['author'])
    .rename(columns={'Unnamed: 0': 'doc_id'})
)

In [None]:
# Replace the 'date' column with its datetime-parsed version.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Preview the first rows after the column changes above.
df.head()

In [None]:
# Look at the raw 'content' value stored under index label 0.
df['content'][0]

In [None]:
# Count how many 'content' entries are already Python list objects
# (isinstance check per entry, summed over the column).
df["content"].apply(lambda x: isinstance(x, list)).sum()

In [None]:
import ast

# Each 'content' cell is expected to hold the text of a Python list literal.
# Parse it with ast.literal_eval, which accepts literals only, instead of eval,
# which would execute any expression found in the CSV.
df["content"] = df["content"].apply(ast.literal_eval)

# One row per list element; the other columns are repeated and each new row
# keeps the label of the row it came from, so index labels are no longer
# unique after this step.
df = df.explode("content")

In [None]:
# Row/column count after exploding 'content'.
df.shape

In [None]:
# Count entries whose content is empty or whitespace-only
# (this cell only counts; the rows are filtered out in a later cell).
(df['content'].str.strip() == '').sum()

In [None]:
# Show the rows whose content is empty or whitespace-only.
df[df['content'].str.strip() == '']

In [None]:
# Keep only rows with non-blank text. Rows without text (NaN, e.g. produced by
# exploding an empty list) are dropped as well: `NaN != ""` evaluates to True,
# so the plain inequality test alone would let them through to the cleaners.
content_stripped = df["content"].str.strip()
df = df[content_stripped.notna() & (content_stripped != "")]

In [None]:
# Show rows that repeat an earlier row in every column, sorted by content
# (this cell only displays them; they are dropped in a later cell).
df[df.duplicated()].sort_values("content")

In [None]:
# Keep the first occurrence of every fully duplicated row and discard the rest.
df = df.drop_duplicates()

# Advanced Text Cleaning Process


In [None]:
from ftfy import fix_text, fix_encoding
from unstructured.cleaners.core import replace_unicode_quotes
from transformers import pipeline
import torch

In [None]:
# Set PyTorch's float32 matmul precision to 'medium'.
# NOTE(review): presumably chosen to speed up the classifiers below at some
# cost in precision — confirm against the PyTorch documentation.
torch.set_float32_matmul_precision('medium')

In [None]:
def _normalize_text(raw):
    """Run fix_text, then fix_encoding, then replace_unicode_quotes on one string."""
    return replace_unicode_quotes(fix_encoding(fix_text(raw)))


# Apply the three cleaners, in that order, to every paragraph.
df['content'] = df['content'].apply(_normalize_text)

In [None]:
# Show the first remaining paragraph. Positional access is used because index
# labels repeat after `explode`: `df['content'][0]` would yield every row
# labelled 0, or raise KeyError once those rows have been filtered out.
print(df['content'].iloc[0])

In [None]:
# Detect the language of every paragraph with langid (run in parallel through
# pandarallel); `classify(x)[0]` keeps the first element of langid's result,
# which later cells compare against codes such as 'en' and 'fr'.
df['language'] = df['content'].parallel_apply(lambda x: langid.classify(x)[0])
# Number of paragraphs per detected language.
df['language'].value_counts()

In [None]:
# Inspect the rows whose detected language code is 'fr'.
df[df['language'] == 'fr']

In [None]:
# Collect the rows whose detected language is not 'en' and list the distinct
# language codes found among them.
non_english = df[df["language"] != "en"]
non_english["language"].unique()

In [None]:
# Hand-picked sample paragraphs used to compare the two zero-shot classifiers.
# Every sample is wrapped in its own one-element list, which is the form the
# classification loop passes to the pipelines.
# The second sample contains a line break; it is written as the escape "\n"
# because a raw line break inside a double-quoted literal is a SyntaxError.
texts = [
    [
        "[ 1 ] see for example: harvey et al 2021, larson et al. 2020, haley et al. 2019, larsen et al. 2019 and ipcc 2018."
    ],
    [
        "qatar petroleum ( qp) is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch phase 2 of its planned 48 million ton per year lng expansion. in its latest sustainability report published on wednesday, qp said its goals include  reducing the emissions intensity of qatar's lng facilities by 25% and of its upstream facilities by at least 15%.  the company is also aiming to reduce gas flaring intensity across its upstream facilities by more than 75% and has raised its carbon capture and storage ambitions from 5 million tons/yr to 7 million tons/yr by 2027. about 2.2 million tons/yr of the carbon capture goal will come from the 32 million ton/yr phase 1 of the lng expansion, also known as the north field east project. a further 1.1 million tons/yr will come from phase 2, known as the north field south project, which will raise qatar's lng capacity by a further 16 million tons/yr. qatar currently has an lng production capacity of around 78 million tons/yr and is eyeing a phased expansion to 126 million tons/yr. qp says it should be able to eliminate routine gas flaring by 2030, with methane emissions limited  by setting a methane intensity target of 0.2% across all facilities by 2025.  the company also plans to build some 1.6 gigawatts of solar energy capacity by 2025, half of which should come from the siraj solar power project next year ( eif jan.22'20). until this month, there had been little news about phase 2 of qatar's massive lng expansion. but mcdermott international said last week that it had been awarded the front-end engineering and design contract for five offshore wellhead platforms ( lngi jan.12'21). bids for construction of all four trains for phase 1 of the lng expansion were submitted in september ( lngi sep.15'20). but qp judged them to be too expensive and none met its targeted 50-week construction schedule. shortlisted contractors were asked to look for cost savings and submit new bids. \nthe contract, which consultancy rystad estimates to be worth around $ 35 billion, is expected to be awarded by mar. 31. shortly after the construction contract is awarded, qp is expected to select foreign investments partners to take stakes of up to 30% in the phase 1 trains. exxon mobil, royal dutch shell, total, chevron, conocophillips and eni have been shortlisted. qp has repeatedly said that it is prepared to proceed without international investment partners if it determines that the offers it receives are not sufficiently attractive. but the shortlisted companies are expected to bid aggressively for what is expected to be the world's lowest-cost and most environmentally friendly lng ( lngi nov.9'20). rafiq latta, nicosia"
    ],
    [
        "government actions in opposition to oil and gas introduce a range of potentially dangerous insecurities. we have been there, and done all of this before with oil and seen the consequences. that past experience, in large part, underlies notions of the criticality of minerals."
    ],
    [
        "a: we’ re not going to be anywhere near the pace and scale that we need to be in this clean energy transition. we need to accelerate even further with very robust, well thought-through government levers, funding streams, authorities and regulation, as well as private sector leadership."
    ],
    [
        "“ [ exploration is ] where there might be the best opportunity right now to really create some long-term substantial returns, because there’ s great opportunity, ” said apa ceo john christmann."
    ],
    [
        "“ i don't think it's going to be explicit, ” he said. “ i think it's not necessarily going to be the first or second thing. but probably the third [ or ] fourth thing. what we're observing is in the due diligence process, understanding if this deal is going to be accretive day one to the esg profile. ”"
    ],
    [
        "ørsted has taken a final investment decision ( fid) on its first renewable hydrogen project, with plans to launch the facility later this year."
    ],
    [
        "gradient comfort – about us [ online ] available at: https: //www.gradientcomfort.com/pages/about-us"
    ],
    ["image credit: mariskavegter/shutterstock.com"],
    ["related topics: carbon footprint renewable diesel utility"],
    ["4. zero emissions power"],
    ["برك المياه تقفل أوتوستراد # زوق مصبح # ملجق # لبنان pic.twitter.com/8njf85yq00"],
    ["https: //t.co/kmucrzhy6z pic.twitter.com/ovbxoxseju"],
    ["— patrick pouyanné ( @ ppouyanne) april 11, 2021"],
    [
        "( ofgem assume 2.9mwh per typical 🏠) ( ccc expect ⚡️increase for 🚗🚐🚛from 9 twh in 2022 to 47twh to 2030) https: //t.co/acoaky8tlj"
    ],
    ["ford mustang mach-e. photo by zach shahan | cleantechnica."],
    ["youtube: https: //www.youtube.com/c/bluettiofficial"],
    [
        "volkswagen id. buzz concept electric van, aka hippie bus. image courtesy of volkswagen."
    ],
    ["issn © 1532-1231 | issn © 2577-9877 | issn © 1532-1266 |"],
    ["your password *"],
    ["reset password"],
    ["decentralise"],
]

In [None]:
# Candidate labels handed to the zero-shot classifiers. Labels in
# `positive_labels` count as "real text"; the rest count as noise.
positive_labels = ["Text Paragraph", "Text Report", "Text Blog"]
# NOTE(review): "Social Media Refernce" is misspelled. It is kept verbatim here
# because the label text is passed to the classifiers and correcting it could
# change their scores — decide deliberately before fixing.
negative_labels = ["Reference", "Link", "Topic Tags", "Image Source", "Image Credits", "Password", "EmailAddress",
                   "Cookie Consent", "Noisy Text", "Social Media Refernce", "Single Word", "Copyright"]


def pos_score(df):
    """Return, per row, the summed score of the labels found in `positive_labels`.

    Parameters
    ----------
    df : DataFrame or mapping
        Must provide "labels" and "scores"; for each row, the two entries are
        parallel sequences (score i belongs to label i).

    Returns
    -------
    list
        One number per row; 0 for a row that has no positive label.
    """
    return [
        sum(score for label, score in zip(labels, scores) if label in positive_labels)
        for labels, scores in zip(df["labels"], df["scores"])
    ]


# Full candidate list, positives first.
classes_verbalized = positive_labels + negative_labels

In [None]:
def _classify_samples(classifier, model_name):
    """Run `classifier` over every entry of `texts` and tag the rows with `model_name`.

    Each entry of `texts` is passed to the pipeline on its own, together with
    `classes_verbalized` and ``multi_label=False``. The per-entry outputs are
    turned into DataFrames, given a "model" column and concatenated. The index
    is not reset, so the per-entry row labels are kept as they are.
    """
    frames = [
        pd.DataFrame(classifier(text, classes_verbalized, multi_label=False)).assign(model=model_name)
        for text in texts
    ]
    return pd.concat(frames)


# Load both zero-shot models on the GPU.
zeroshot_classifier_v3 = pipeline(
    "zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0", device_map="cuda"
)
zeroshot_classifier_m3 = pipeline(
    "zero-shot-classification", model="MoritzLaurer/bge-m3-zeroshot-v2.0", device_map="cuda"
)

# One result frame per model, built over the same sample texts.
df_v3 = _classify_samples(zeroshot_classifier_v3, "v3")
df_m3 = _classify_samples(zeroshot_classifier_m3, "m3")

In [None]:
# Persist both sample result frames as CSV, without the index.
df_v3.to_csv('data/df_v3.csv', index=False)
df_m3.to_csv('data/df_m3.csv', index=False)

In [None]:
# Reload the sample results from CSV. A later cell parses the "labels" and
# "scores" columns with ast.literal_eval, i.e. it expects them as strings here.
df_v3 = pd.read_csv('data/df_v3.csv')
df_m3 = pd.read_csv('data/df_m3.csv')

In [None]:
# Column dtypes and non-null counts of the reloaded v3 results.
df_v3.info()

In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt


# Assuming df_v3 and df_m3 are already defined and contain the data as described


# Function to convert string representation of lists into actual lists
def str_to_list(s):
    """Parse the string form of a list (e.g. ``"['a', 'b']"``) into a list.

    Values that are not strings are returned unchanged, so applying the
    conversion a second time to already-parsed columns (for instance when the
    cell is re-run) does not raise.
    """
    if not isinstance(s, str):
        return s
    return ast.literal_eval(s)


# Parse the stringified "labels" and "scores" columns of both result frames.
for _frame in (df_v3, df_m3):
    for _column in ("labels", "scores"):
        _frame[_column] = _frame[_column].apply(str_to_list)

def _plot_scores(ax, row, color):
    """Draw the label/score bar chart for one classification result row on `ax`."""
    ax.bar(row["labels"], row["scores"], color=color)
    ax.set_title(f"Model: {row['model']}")
    ax.set_xlabel("Labels")
    ax.set_ylabel("Scores")
    # Slant the label names so they stay readable.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")


# The two frames are paired by row position: this assumes both hold the same
# sequences in the same order.
for idx in range(max(len(df_v3), len(df_m3))):
    # Print the sequence, taken from df_v3 when it has this row, else from df_m3.
    if idx < len(df_v3):
        print("Sequence from df_v3:", df_v3.iloc[idx]["sequence"])
    else:
        print("Sequence from df_m3:", df_m3.iloc[idx]["sequence"])

    # One figure per sequence: v3 on the left, m3 on the right.
    fig, (ax_v3, ax_m3) = plt.subplots(1, 2, figsize=(12, 6))
    for ax, frame, color in ((ax_v3, df_v3, "blue"), (ax_m3, df_m3, "green")):
        if idx < len(frame):
            _plot_scores(ax, frame.iloc[idx], color)
        else:
            # No result for this position in the shorter frame: leave its side blank.
            ax.set_visible(False)

    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
    # Release the figure; one is created per sequence.
    plt.close(fig)

In [None]:
# Labels and scores stored under index label 0 of the v3 results.
print(df_v3["labels"][0])
print(df_v3["scores"][0])

In [None]:
# Add the summed positive-label score of every sample to both result frames.
df_v3["pos_prob"] = pos_score(df_v3)
df_m3["pos_prob"] = pos_score(df_m3)

In [None]:
# Display the v3 results, including the new "pos_prob" column.
df_v3

In [None]:
# Sample sequences whose v3 "pos_prob" is above 0.6.
df_v3[df_v3["pos_prob"] > 0.6]["sequence"].tolist()

In [None]:
# Sample sequences whose m3 "pos_prob" is above 0.5 (a lower cut-off than the
# 0.6 used for v3).
df_m3[df_m3["pos_prob"] > 0.5]["sequence"].tolist()

In [None]:
# Put the v3 pipeline's underlying model into evaluation mode.
zeroshot_classifier_v3.model = zeroshot_classifier_v3.model.eval()

In [None]:
# Ask PyTorch to release its cached GPU memory before the full classification run.
torch.cuda.empty_cache()

In [None]:
# Classify every paragraph in df['content'] with the v3 zero-shot pipeline, in
# batches of 96, and store the returned predictions as a new column.
# NOTE(review): `torch_dtype` is normally given when a pipeline is constructed —
# confirm that passing it at call time has any effect here.
df['classification_prediction'] = zeroshot_classifier_v3(df['content'].tolist(),
                                                         classes_verbalized,
                                                         multi_label=False,
                                                         torch_dtype=torch.bfloat16,
                                                         batch_size=96)

In [None]:
# Checkpoint the classified frame to parquet.
df.to_parquet('data/classified.parquet')

In [None]:
# Reload the classified checkpoint under the name `data`.
data = pd.read_parquet('data/classified.parquet')

In [None]:
# Preview the first five rows of the reloaded checkpoint.
data.head(5)

In [None]:
# Pull the 'labels' entry out of each stored prediction into its own column.
data['labels'] = data['classification_prediction'].parallel_apply(lambda x: x['labels'])

In [None]:
# Pull the 'scores' entry out of each stored prediction into its own column.
data['scores'] = data['classification_prediction'].parallel_apply(lambda x: x['scores'])

In [None]:
# Summed positive-label score per paragraph, computed from the 'labels' and
# 'scores' columns.
data['pos_score'] = pos_score(data)

In [None]:
# Keep only paragraphs whose summed positive-label score is above 0.65.
data = data[data["pos_score"] > 0.65]

In [None]:
# Row/column count after the score filter.
data.shape

In [None]:
# Earlier draft of the NER step, kept as a bare string literal: none of it
# runs, the cell merely evaluates to this string.
"""from tqdm import tqdm
from flair.data import Sentence
from flair.models import SequenceTagger
import json

tagger = SequenceTagger.load("flair/ner-english-ontonotes-large").eval()

t = [Sentence(x) for x in tqdm(data["content"].tolist()) if x]

w = [tagger.predict(x, mini_batch_size=32, return_probabilities_for_all_classes=True) for x in tqdm(t) if x]
w = [x.get_spans('ner').to_dict() for x in tqdm(w)]

data["ner"] = data["ner"].apply(lambda x: x.tolist()).apply(json.dumps)"""

In [None]:
from tqdm import tqdm
from flair.data import Sentence
from flair.models import SequenceTagger
import json

# Load the tagger and set it to evaluation mode.
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large").eval()

# Build exactly one Sentence per row of `data`, so that the predictions below
# line up with the frame row by row (no entries are skipped).
t = [Sentence(x) for x in tqdm(data["content"].tolist())]

# Tag all sentences in a single call so that `mini_batch_size` takes effect;
# predicting one sentence at a time gives every batch a size of 1.
# Predictions are written onto the Sentence objects in place.
tagger.predict(t, mini_batch_size=32, return_probabilities_for_all_classes=True, verbose=True)

# Store the 'ner' spans of each sentence as a list of dicts. The JSON round
# trip is kept from the original code, so the stored values are the same
# JSON-compatible Python objects as before.
data["ner"] = [
    json.loads(json.dumps([span.to_dict() for span in sentence.get_spans('ner')]))
    for sentence in t
]

In [None]:
from unstructured.cleaners import core
from functools import partial

# Normalise each paragraph with unstructured's `clean`, with the
# extra_whitespace, dashes and bullets options switched on.
# NOTE(review): this runs after the "ner" column was computed, so any character
# offsets stored there refer to the uncleaned text — confirm that is acceptable.
data["content"] = data["content"].apply(
    lambda paragraph: core.clean(paragraph, extra_whitespace=True, dashes=True, bullets=True)
)

In [None]:
# Preview the first five rows after cleaning.
data.head(5)

In [None]:
# Show the third remaining paragraph. Positional access is used because the
# frame has been filtered and its index labels are not a clean 0..n-1 range:
# `data["content"][2]` looks up the *label* 2, which may be missing or may
# match several rows.
print(data["content"].iloc[2])

In [None]:
# Write the final cleaned frame to parquet, without the index.
data.to_parquet("data/processed/clean_cleantech.parquet", index=False)