In [None]:
-import pandas as pd
import numpy as np
import re
import string
import random
from tqdm.auto import tqdm
import torch
import transformers
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from transformers import AdamW, get_cosine_schedule_with_warmup
from transformers import pipeline, set_seed

In [None]:
set_seed(42)

In [None]:
df = pd.read_csv("/kaggle/input/summary-data-all-columns/Summary_data_all_columns.csv")

In [None]:
df.head(3)

Unnamed: 0,car name,year,Pros,Cons,Full Description,base msrp,engine,drivetrain,dimensions,Full Name,GPT-2 Summarization,horsepower,engine_type,Torque,Torque_RPM,drive_type,transmission,width,Length,Height
0,jeep wrangler,1998,"['Unmatched off-road capability, overflowing w...","['Soft top is fun, but still a pain in the $%!...",Jeep has improved off-road capability by incre...,,"Inline 4 cylinder \n Horsepower: 120 hp @ 5,40...",Type: four wheel drive \n Transmission: 5-spee...,Length: 147.7 in. / Height: 69.6 in. \n Overal...,jeep wrangler 1998,Jeep has improved off-road capability by incre...,120,Inline 4,140 lb-ft,3500,four wheel drive,5-speed manual,66.7,147.7 in.,69.6 in.
1,toyota 4runner,2005,"['Powerful engine lineup, well mannered on pav...","[""Cargo capacity isn't much more than what man...",The base V6 now comes with a five-speed automa...,"$27,795","V6 cylinder \n Horsepower: 245 hp @ 5,200 rpm ...",Type: rear wheel drive \n Transmission: 5-spee...,Length: 189.0 in. / Height: 68.5 in. \n Overal...,toyota 4runner 2005,The base V6 now comes with a five-speed automa...,245,V6,282 lb-ft,3800,rear wheel drive,5-speed automatic,73.8,189.0 in.,68.5 in.
2,toyota tundra,2002,"['Silky V8, Toyota build quality, less-than-fu...",['Lacks wide range of choice offered by domest...,"SR5 models have new 16-inch wheels, and a limi...","$15,605","V6 cylinder \n Horsepower: 190 hp @ 4,800 rpm ...",Type: rear wheel drive \n Transmission: 5-spee...,Length: 217.5 in. / Height: 70.5 in. \n Bed Le...,toyota tundra 2002,"SR5 models have new 16-inch wheels, and a limi...",190,V6,220 lb-ft,3600,rear wheel drive,5-speed manual,75.2,217.5 in.,70.5 in.


In [None]:
df["base msrp"] = df["base msrp"].str.replace(" ", "")

In [None]:
df["transmission"] = df["transmission"].str.replace(" ", "")

In [None]:
data_QAS=df.drop(columns=['Full Description','engine','drivetrain','dimensions','car name','year'])

In [None]:
data_QAS['base msrp'] = data_QAS['base msrp'].str.replace('N/A', 'not available')

In [None]:
data_QAS.head(2)

Unnamed: 0,Pros,Cons,base msrp,Full Name,GPT-2 Summarization,horsepower,engine_type,Torque,Torque_RPM,drive_type,transmission,width,Length,Height
0,"['Unmatched off-road capability, overflowing w...","['Soft top is fun, but still a pain in the $%!...",not available,jeep wrangler 1998,Jeep has improved off-road capability by incre...,120,Inline 4,140 lb-ft,3500,four wheel drive,5-speedmanual,66.7,147.7 in.,69.6 in.
1,"['Powerful engine lineup, well mannered on pav...","[""Cargo capacity isn't much more than what man...","$27,795",toyota 4runner 2005,The base V6 now comes with a five-speed automa...,245,V6,282 lb-ft,3800,rear wheel drive,5-speedautomatic,73.8,189.0 in.,68.5 in.


In [None]:
def generate_context(row):
    engine_str = f"a {row['engine_type']}engine that delivers {row['horsepower']} horsepower and {row['Torque']} torque at {row['Torque_RPM']}"
    dimension_str = f"{row['Length']} in length, {row['Height']} in height, and {row['width']} in width "
    context_str = f"The {row['Full Name']} is a vehicle with {engine_str}. It has {row['drive_type']}and a {row['transmission']}. The car's dimensions are {dimension_str}. Additionally, the base MSRP of this car is {row['base msrp']}.\n Now an in depth review of the {row['Full Name']}:"
    return context_str

In [None]:
data_QAS['Context'] = data_QAS.apply(generate_context, axis=1)

In [None]:
data_QAS.loc[2000,'Context']

"The subaru legacy 2012 is a vehicle with a Flat 4 engine that delivers 170 horsepower and 170 lb-ft torque at 4,000. It has all wheel drive  and a 6-speedmanual. The car's dimensions are 186.4 in. in length, 59.3 in. in height, and 71.7 in width . Additionally, the base MSRP of this car is $19,995.\n Now an in depth review of the subaru legacy 2012:"

In [None]:
data_QAS['Context'] = data_QAS['Context'].str.strip()

In [None]:
data_QAS.loc[15,'Context']

"The honda accord 1999 is a vehicle with a V6 engine that delivers 200 horsepower and 195 lb-ft torque at 4,700. It has front wheel drive  and a 4-speedautomatic. The car's dimensions are 188.8 in. in length, 56.9 in. in height, and 70.3 in width . Additionally, the base MSRP of this car is not available.\n Now an in depth review of the honda accord 1999:"

In [None]:
data_QAS['final_context'] = data_QAS['Context'] + '\n\n' + data_QAS['GPT-2 Summarization']

In [None]:
data_QAS.loc[15,'final_context']

"The honda accord 1999 is a vehicle with a V6 engine that delivers 200 horsepower and 195 lb-ft torque at 4,700. It has front wheel drive  and a 4-speedautomatic. The car's dimensions are 188.8 in. in length, 56.9 in. in height, and 70.3 in width . Additionally, the base MSRP of this car is not available.\n Now an in depth review of the honda accord 1999:\n\nFor the 1999 Honda Accord, coupes remain unchanged after their recent overhaul, but the sedans receive new seat fabric, and the LX and EX sedans now feature fold-away side mirrors. The benchmark.  The best-selling car in America.  The highest resale value in its class.  These are all statements that have been made with regularity concerning the Honda Accord, a vehicle that is always on the short list of the most popular cars in this country.  The Accord won a loyal base of customers by offering sprightly performance, room for four, frugal fuel economy and a virtual guarantee that, if cared for properly, the Accord would not break. 

In [None]:
data_QAS.loc[2000,'final_context']

"The subaru legacy 2012 is a vehicle with a Flat 4 engine that delivers 170 horsepower and 170 lb-ft torque at 4,000. It has all wheel drive  and a 6-speedmanual. The car's dimensions are 186.4 in. in length, 59.3 in. in height, and 71.7 in width . Additionally, the base MSRP of this car is $19,995.\n Now an in depth review of the subaru legacy 2012:\n\nBesides a new audio system for upper trim levels and some minor shuffling of features, the Subaru Legacy returns largely unchanged for 2012. But  some of what makes it unique also serves to limit its appeal. On the downside, the Subaru Legacy's weight and reduced  efficiency from routing power to all four wheels results in less favorable fuel  economy numbers. Its road-holding prowess in inclement weather is a plus, for  sure, but if you live in milder climates, it may be harder to justify. Furthermore, the sporty 2.5 GT Limited is not available with an automatic  transmission, and a rather fussy navigation system is an option only for 

In [None]:
data_QAS.to_csv('/kaggle/working/Summary_Final_context.csv', index=False)

In [None]:
final_df = pd.DataFrame()

In [None]:
def generate_question(row):
    question_list = [
        "What is the horsepower rating of the engine in the {0}?".format(row['Full Name']),
        "How much horsepower does the {0}'s engine produce?".format(row['Full Name']),
        "What is the horsepower output of the {0}'s engine?".format(row['Full Name']),
        "Can you tell me the horsepower of the engine in the {0}?".format(row['Full Name']),
        "How many horsepower does the {0}'s engine generate?".format(row['Full Name'])
    ]
    question = random.choice(question_list)
    answer = str(row['horsepower'])
    context = row['final_context'].strip()
    start_pos = context.find(answer)
    end_pos = start_pos +len(answer)+len("horsepower")
    return question, answer, start_pos, end_pos, context

In [None]:
QAS = pd.DataFrame(columns=['question', 'answer', 'start_pos', 'end_pos', 'final_context'])

In [None]:
QAS = data_QAS.apply(generate_question, axis=1, result_type='expand')
QAS.columns = ['question', 'answer', 'start_pos', 'end_pos', 'context']

In [None]:
QAS.loc[1,'context']

"The toyota 4runner 2005 is a vehicle with a V6 engine that delivers 245 horsepower and 282 lb-ft torque at 3,800. It has rear wheel drive  and a 5-speedautomatic. The car's dimensions are 189.0 in. in length, 68.5 in. in height, and 73.8 in width . Additionally, the base MSRP of this car is $27,795.\n Now an in depth review of the toyota 4runner 2005:\n\nThe base V6 now comes with a five-speed automatic, while the optional V8 has been upgraded to deliver 270-hp and 330 pound-feet of torque. A rollover sensor now comes with the optional head curtain airbag system. A limited-slip differential has been added to the stability control system. SR5 models now feature a chrome grille, color-keyed bumpers and black running boards, while the Sport model get a color-keyed grille. All Limiteds receive color-keyed bumpers, illuminated black running boards and a black roof rack. Highly capable whether on the pavement or in the dirt, the 4Runner is a well-rounded midsize SUV, and one of our favorite

In [None]:
QAS

Unnamed: 0,question,answer,start_pos,end_pos,context
0,What is the horsepower rating of the engine in...,120,73,86,The jeep wrangler 1998 is a vehicle with a Inl...
1,What is the horsepower output of the toyota 4r...,245,68,81,The toyota 4runner 2005 is a vehicle with a V6...
2,What is the horsepower rating of the engine in...,190,67,80,The toyota tundra 2002 is a vehicle with a V6 ...
3,What is the horsepower output of the toyota ta...,142,73,86,The toyota tacoma 1998 is a vehicle with a Inl...
4,How many horsepower does the ford ranger 1990'...,100,71,84,The ford ranger 1990 is a vehicle with a Inlin...
...,...,...,...,...,...
3136,Can you tell me the horsepower of the engine i...,177,73,86,The jeep renegade 2023 is a vehicle with a Inl...
3137,What is the horsepower output of the bmw alpin...,600,67,80,The bmw alpina-b7 2022 is a vehicle with a V8 ...
3138,Can you tell me the horsepower of the engine i...,237,78,91,The chevrolet colorado 2022 is a vehicle with ...
3139,What is the horsepower rating of the engine in...,382,72,85,The bmw 4-series 2023 is a vehicle with a Inli...


In [None]:
data_QAS['final_context']

0       The jeep wrangler 1998 is a vehicle with a Inl...
1       The toyota 4runner 2005 is a vehicle with a V6...
2       The toyota tundra 2002 is a vehicle with a V6 ...
3       The toyota tacoma 1998 is a vehicle with a Inl...
4       The ford ranger 1990 is a vehicle with a Inlin...
                              ...                        
3136    The jeep renegade 2023 is a vehicle with a Inl...
3137    The bmw alpina-b7 2022 is a vehicle with a V8 ...
3138    The chevrolet colorado 2022 is a vehicle with ...
3139    The bmw 4-series 2023 is a vehicle with a Inli...
3140    The ford f-250-super-duty 2022 is a vehicle wi...
Name: final_context, Length: 3141, dtype: object

In [None]:
print(QAS.iloc[1]['context'].split()[68])

now


In [None]:
my_word = "245"
row_position = QAS.loc[1, "context"].find(my_word)

In [None]:
row_position

68

In [None]:
my_word = "horsepower"
row_position = QAS.loc[1, "context"].find(my_word)

In [None]:
row_position

72

In [None]:
QAS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141 entries, 0 to 3140
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question   3141 non-null   object
 1   answer     3141 non-null   object
 2   start_pos  3141 non-null   int64 
 3   end_pos    3141 non-null   int64 
 4   context    3141 non-null   object
dtypes: int64(2), object(3)
memory usage: 122.8+ KB


In [None]:
final_df = pd.concat([final_df, QAS], ignore_index=True)

In [None]:
final_df.tail(3)

Unnamed: 0,question,answer,start_pos,end_pos,context
3138,Can you tell me the horsepower of the engine i...,237,78,91,The chevrolet colorado 2022 is a vehicle with ...
3139,What is the horsepower rating of the engine in...,382,72,85,The bmw 4-series 2023 is a vehicle with a Inli...
3140,How many horsepower does the ford f-250-super-...,385,75,88,The ford f-250-super-duty 2022 is a vehicle wi...


In [None]:
def generate_question(row):
    question_list = [
        "What type of engine does the {0} have?".format(row['Full Name']),
        "What configuration does the engine in the {0} use?".format(row['Full Name']),
        "How many cylinders does the engine in the {0} utilize?".format(row['Full Name']),
        "What is the displacement of the engine in the {0}?".format(row['Full Name'])
    ]
    question = random.choice(question_list)
    answer = str(row['engine_type'])
    context = row['final_context'].strip()
    start_pos = context.find(answer)
    end_pos = start_pos + len(answer)+len("engine")# get the last character position of the engine word
    return question, answer, start_pos, end_pos, context

In [None]:
QAS = data_QAS.apply(generate_question, axis=1, result_type='expand')
QAS.columns = ['question', 'answer', 'start_pos', 'end_pos', 'context']

In [None]:
QAS.tail(3)

Unnamed: 0,question,answer,start_pos,end_pos,context
3138,How many cylinders does the engine in the chev...,Inline 4,48,63,The chevrolet colorado 2022 is a vehicle with ...
3139,What type of engine does the bmw 4-series 2023...,Inline 6,42,57,The bmw 4-series 2023 is a vehicle with a Inli...
3140,What is the displacement of the engine in the ...,V8,51,60,The ford f-250-super-duty 2022 is a vehicle wi...


In [None]:
print(QAS.iloc[3138]['context'][63])

 


In [None]:
QAS

Unnamed: 0,question,answer,start_pos,end_pos,context
0,What type of engine does the jeep wrangler 199...,Inline 4,43,58,The jeep wrangler 1998 is a vehicle with a Inl...
1,How many cylinders does the engine in the toyo...,V6,44,53,The toyota 4runner 2005 is a vehicle with a V6...
2,What is the displacement of the engine in the ...,V6,43,52,The toyota tundra 2002 is a vehicle with a V6 ...
3,What type of engine does the toyota tacoma 199...,Inline 4,43,58,The toyota tacoma 1998 is a vehicle with a Inl...
4,What configuration does the engine in the ford...,Inline 4,41,56,The ford ranger 1990 is a vehicle with a Inlin...
...,...,...,...,...,...
3136,What type of engine does the jeep renegade 202...,Inline 4,43,58,The jeep renegade 2023 is a vehicle with a Inl...
3137,How many cylinders does the engine in the bmw ...,V8,43,52,The bmw alpina-b7 2022 is a vehicle with a V8 ...
3138,How many cylinders does the engine in the chev...,Inline 4,48,63,The chevrolet colorado 2022 is a vehicle with ...
3139,What type of engine does the bmw 4-series 2023...,Inline 6,42,57,The bmw 4-series 2023 is a vehicle with a Inli...


In [None]:
final_df = pd.concat([final_df, QAS], ignore_index=True)

In [None]:
def generate_question(row):
    question_list = [
        "How much does the {0} cost at retail price?".format(row['Full Name']),
        "What is the manufacturer's suggested retail price MSRP for the {0}?".format(row['Full Name']),
        "Can you tell me the MSRP for the {0}?".format(row['Full Name']),
        "What is the MSRP of the {0}?".format(row['Full Name'])
    ]
    question = random.choice(question_list)
    answer = str(row['base msrp'])
    context = row['final_context'].strip()
    end_pos = context.find(answer)+len(answer)
    start_pos = end_pos - len(answer) - len("the base MSRP of this car is") # get the last character position of the MSRP word
    return question, answer, start_pos, end_pos, context

In [None]:
QAS = data_QAS.apply(generate_question, axis=1, result_type='expand')
QAS.columns = ['question', 'answer', 'start_pos', 'end_pos', 'context']

In [None]:
final_df = pd.concat([final_df, QAS], ignore_index=True)

In [None]:
final_df.tail(3)

Unnamed: 0,question,answer,start_pos,end_pos,context
9420,What is the MSRP of the chevrolet colorado 2022?,"$30,695",283,318,The chevrolet colorado 2022 is a vehicle with ...
9421,What is the manufacturer's suggested retail pr...,"$59,245",277,312,The bmw 4-series 2023 is a vehicle with a Inli...
9422,What is the MSRP of the ford f-250-super-duty ...,"$43,575",280,315,The ford f-250-super-duty 2022 is a vehicle wi...


In [None]:
final_df.isna().sum()

question     0
answer       0
start_pos    0
end_pos      0
context      0
dtype: int64

In [None]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9423 entries, 0 to 9422
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question   9423 non-null   object
 1   answer     9423 non-null   object
 2   start_pos  9423 non-null   int64 
 3   end_pos    9423 non-null   int64 
 4   context    9423 non-null   object
dtypes: int64(2), object(3)
memory usage: 368.2+ KB


In [None]:
final_df.to_csv('/kaggle/working/Final_QAS_data_prefinetuning.csv', index=False)